Spaces:
Running
on
Zero
Running
on
Zero
Add text preprocessing and an ONNX CPU inference alternative
Browse files
- app.py +134 -9
- german_text_preprocessor.py +397 -0
app.py
CHANGED
|
@@ -3,8 +3,11 @@ import torch
|
|
| 3 |
import torch._inductor
|
| 4 |
import spaces
|
| 5 |
from char_tokenizers import GermanCharsTokenizer
|
|
|
|
| 6 |
from huggingface_hub import hf_hub_download
|
| 7 |
import os
|
|
|
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
# --- Download Model Files from Hugging Face ---
|
|
@@ -21,11 +24,15 @@ def download_models():
|
|
| 21 |
"karlsson_fastpitch_encoder.pt2",
|
| 22 |
"karlsson_fastpitch_decoder.pt2",
|
| 23 |
"karlsson_hifigan.pt2",
|
|
|
|
|
|
|
| 24 |
],
|
| 25 |
"Warholt/CaroTTS-60M-DE-Caro": [
|
| 26 |
"caro_fastpitch_encoder.pt2",
|
| 27 |
"caro_fastpitch_decoder.pt2",
|
| 28 |
"caro_hifigan.pt2",
|
|
|
|
|
|
|
| 29 |
],
|
| 30 |
}
|
| 31 |
|
|
@@ -97,7 +104,62 @@ MODELS = {
|
|
| 97 |
},
|
| 98 |
}
|
| 99 |
|
| 100 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
@spaces.GPU(duration=60)
|
| 102 |
def synthesize_speech(text: str, voice: str, pace: float = 1.0):
|
| 103 |
"""
|
|
@@ -105,10 +167,13 @@ def synthesize_speech(text: str, voice: str, pace: float = 1.0):
|
|
| 105 |
for the duration of this function.
|
| 106 |
"""
|
| 107 |
if not text.strip():
|
| 108 |
-
return None
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
# Tokenize text
|
| 111 |
-
tokens = TOKENIZER.encode(
|
| 112 |
tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to("cuda")
|
| 113 |
|
| 114 |
# Prepare control parameters
|
|
@@ -137,15 +202,29 @@ def synthesize_speech(text: str, voice: str, pace: float = 1.0):
|
|
| 137 |
sample_rate = 44100
|
| 138 |
audio_array = audio.squeeze().cpu().numpy()
|
| 139 |
|
| 140 |
-
return (sample_rate, audio_array)
|
| 141 |
|
| 142 |
|
| 143 |
-
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
|
| 145 |
gr.Markdown(
|
| 146 |
"""
|
| 147 |
# 🎙️ German Text-to-Speech
|
| 148 |
Generate German speech using two different voices: **Caro** and **Karlsson**.
|
|
|
|
| 149 |
"""
|
| 150 |
)
|
| 151 |
|
|
@@ -153,7 +232,7 @@ with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
|
|
| 153 |
with gr.Column():
|
| 154 |
text_input = gr.Textbox(
|
| 155 |
label="Text to synthesize",
|
| 156 |
-
value="
|
| 157 |
lines=3,
|
| 158 |
)
|
| 159 |
voice_dropdown = gr.Dropdown(
|
|
@@ -162,15 +241,61 @@ with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
|
|
| 162 |
pace_slider = gr.Slider(
|
| 163 |
minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speaking Rate"
|
| 164 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
generate_btn = gr.Button("Generate Speech 🔊", variant="primary")
|
| 166 |
|
| 167 |
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
audio_output = gr.Audio(label="Generated Audio", type="numpy")
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
generate_btn.click(
|
| 171 |
-
fn=
|
| 172 |
-
inputs=[text_input, voice_dropdown, pace_slider],
|
| 173 |
-
outputs=audio_output,
|
| 174 |
)
|
| 175 |
|
| 176 |
if __name__ == "__main__":
|
|
|
|
| 3 |
import torch._inductor
|
| 4 |
import spaces
|
| 5 |
from char_tokenizers import GermanCharsTokenizer
|
| 6 |
+
from german_text_preprocessor import preprocess_german_text
|
| 7 |
from huggingface_hub import hf_hub_download
|
| 8 |
import os
|
| 9 |
+
import onnxruntime as ort
|
| 10 |
+
import numpy as np
|
| 11 |
|
| 12 |
|
| 13 |
# --- Download Model Files from Hugging Face ---
|
|
|
|
| 24 |
"karlsson_fastpitch_encoder.pt2",
|
| 25 |
"karlsson_fastpitch_decoder.pt2",
|
| 26 |
"karlsson_hifigan.pt2",
|
| 27 |
+
"karlsson_fastpitch.onnx",
|
| 28 |
+
"karlsson_hifigan.onnx",
|
| 29 |
],
|
| 30 |
"Warholt/CaroTTS-60M-DE-Caro": [
|
| 31 |
"caro_fastpitch_encoder.pt2",
|
| 32 |
"caro_fastpitch_decoder.pt2",
|
| 33 |
"caro_hifigan.pt2",
|
| 34 |
+
"caro_fastpitch.onnx",
|
| 35 |
+
"caro_hifigan.onnx",
|
| 36 |
],
|
| 37 |
}
|
| 38 |
|
|
|
|
| 104 |
},
|
| 105 |
}
|
| 106 |
|
| 107 |
+
# Initialize ONNX sessions for CPU inference
|
| 108 |
+
# ONNX Runtime sessions used by the CPU inference path, grouped per voice.
# Each voice has a "fastpitch" session and a "hifigan" session, both opened
# once at import time from the local "aot_package" directory.
ONNX_SESSIONS = dict(
    Caro=dict(
        fastpitch=ort.InferenceSession("aot_package/caro_fastpitch.onnx"),
        hifigan=ort.InferenceSession("aot_package/caro_hifigan.onnx"),
    ),
    Karlsson=dict(
        fastpitch=ort.InferenceSession("aot_package/karlsson_fastpitch.onnx"),
        hifigan=ort.InferenceSession("aot_package/karlsson_hifigan.onnx"),
    ),
)
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# --- 3. CPU Inference Function (ONNX) ---
|
| 121 |
+
def synthesize_speech_cpu(text: str, voice: str, pace: float = 1.0):
    """
    Synthesize speech using ONNX models on CPU.

    Args:
        text: Raw input text. It is normalized with
            ``preprocess_german_text`` before being tokenized.
        voice: Key into ``ONNX_SESSIONS`` selecting the FastPitch/HiFiGAN
            session pair to use.
        pace: Speaking-rate value applied uniformly to every token.

    Returns:
        A pair ``((sample_rate, audio_array), preprocessed_text)``.
        ``(None, "")`` is returned for blank input, and
        ``(None, preprocessed_text)`` when tokenization yields no tokens.

    Raises:
        KeyError: If ``voice`` is not a key of ``ONNX_SESSIONS``.
    """
    if not text.strip():
        return None, ""

    # Resolve the sessions first so a bad voice fails fast with a clear message.
    try:
        voice_sessions = ONNX_SESSIONS[voice]
    except KeyError:
        raise KeyError(
            f"Unknown voice {voice!r}; available voices: {sorted(ONNX_SESSIONS)}"
        ) from None

    # Preprocess text (numbers, dates, decimals -> spoken form), then tokenize.
    preprocessed_text = preprocess_german_text(text)
    tokens = TOKENIZER.encode(preprocessed_text)

    # Nothing to synthesize: avoid feeding a zero-length sequence to ONNX.
    if len(tokens) == 0:
        return None, preprocessed_text

    # FastPitch inputs, each with a leading batch dimension of 1.
    token_count = len(tokens)
    inputs = {
        "text": np.array([tokens], dtype=np.int64),
        "pace": np.full((1, token_count), pace, dtype=np.float32),
        "pitch": np.zeros((1, token_count), dtype=np.float32),
    }

    # Generate spectrogram with FastPitch, then audio with HiFiGAN.
    spec = voice_sessions["fastpitch"].run(None, inputs)[0]
    audio = voice_sessions["hifigan"].run(None, {"spec": spec})[0]

    # Convert to the (sample_rate, ndarray) format expected by Gradio.
    sample_rate = 44100
    audio_array = audio.squeeze()

    return (sample_rate, audio_array), preprocessed_text
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# --- 4. GPU Inference Function ---
|
| 163 |
@spaces.GPU(duration=60)
|
| 164 |
def synthesize_speech(text: str, voice: str, pace: float = 1.0):
|
| 165 |
"""
|
|
|
|
| 167 |
for the duration of this function.
|
| 168 |
"""
|
| 169 |
if not text.strip():
|
| 170 |
+
return None, ""
|
| 171 |
+
|
| 172 |
+
# Preprocess text: convert numbers, dates, decimals to spoken form
|
| 173 |
+
preprocessed_text = preprocess_german_text(text)
|
| 174 |
|
| 175 |
# Tokenize text
|
| 176 |
+
tokens = TOKENIZER.encode(preprocessed_text)
|
| 177 |
tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to("cuda")
|
| 178 |
|
| 179 |
# Prepare control parameters
|
|
|
|
| 202 |
sample_rate = 44100
|
| 203 |
audio_array = audio.squeeze().cpu().numpy()
|
| 204 |
|
| 205 |
+
return (sample_rate, audio_array), preprocessed_text
|
| 206 |
|
| 207 |
|
| 208 |
+
# --- 5. Combined Inference Function ---
|
| 209 |
+
def synthesize_speech_combined(
    text: str, voice: str, pace: float = 1.0, use_gpu: bool = False
):
    """
    Dispatch a synthesis request to the GPU or the CPU implementation.

    When ``use_gpu`` is true the call is forwarded to ``synthesize_speech``;
    otherwise it goes to ``synthesize_speech_cpu``. The chosen function's
    return value is passed through unchanged.
    """
    backend = synthesize_speech if use_gpu else synthesize_speech_cpu
    return backend(text, voice, pace)
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
# --- 6. Gradio Interface ---
|
| 222 |
with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
|
| 223 |
gr.Markdown(
|
| 224 |
"""
|
| 225 |
# 🎙️ German Text-to-Speech
|
| 226 |
Generate German speech using two different voices: **Caro** and **Karlsson**.
|
| 227 |
+
Numbers, dates, and decimals are automatically converted to spoken form.
|
| 228 |
"""
|
| 229 |
)
|
| 230 |
|
|
|
|
| 232 |
with gr.Column():
|
| 233 |
text_input = gr.Textbox(
|
| 234 |
label="Text to synthesize",
|
| 235 |
+
value="Guten Tag. Herzlich Willkommen zu dieser Demonstration deutscher Sprachsynthese-Modelle. Es stehen Ihnen zwei Stimmen zur Auswahl: Caro und Karlsson. Probieren Sie es aus!",
|
| 236 |
lines=3,
|
| 237 |
)
|
| 238 |
voice_dropdown = gr.Dropdown(
|
|
|
|
| 241 |
pace_slider = gr.Slider(
|
| 242 |
minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speaking Rate"
|
| 243 |
)
|
| 244 |
+
use_gpu_checkbox = gr.Checkbox(
|
| 245 |
+
label="Use GPU (ZeroGPU)",
|
| 246 |
+
value=False,
|
| 247 |
+
info="Enable for faster inference on GPU. Disable for CPU inference (slower but always available).",
|
| 248 |
+
)
|
| 249 |
generate_btn = gr.Button("Generate Speech 🔊", variant="primary")
|
| 250 |
|
| 251 |
with gr.Column():
|
| 252 |
+
preprocessed_output = gr.Textbox(
|
| 253 |
+
label="Preprocessed Text (what will be spoken)",
|
| 254 |
+
lines=3,
|
| 255 |
+
interactive=False,
|
| 256 |
+
)
|
| 257 |
audio_output = gr.Audio(label="Generated Audio", type="numpy")
|
| 258 |
|
| 259 |
+
# Example sentences section
|
| 260 |
+
gr.Markdown("### 📝 Example Sentences")
|
| 261 |
+
gr.Examples(
|
| 262 |
+
examples=[
|
| 263 |
+
[
|
| 264 |
+
"Die Bundeskanzlerin empfing heute den französischen Präsidenten zu einem Staatsbesuch in Berlin. Die Gespräche dauerten mehr als 3 Stunden."
|
| 265 |
+
],
|
| 266 |
+
[
|
| 267 |
+
"Am 15. März 2024 wird die neue Ausstellung im Museum eröffnet. Der Eintritt kostet 12,50 Euro für Erwachsene."
|
| 268 |
+
],
|
| 269 |
+
[
|
| 270 |
+
"In der verzauberten Bibliothek entdeckte die junge Magierin ein uraltes Buch, dessen Seiten im Mondlicht golden schimmerten."
|
| 271 |
+
],
|
| 272 |
+
[
|
| 273 |
+
"Der mutige Ritter zog sein Schwert und stellte sich dem feuerspeienden Drachen. Ein epischer Kampf begann auf dem Gipfel des Berges."
|
| 274 |
+
],
|
| 275 |
+
[
|
| 276 |
+
"Wussten Sie, dass die Große Mauer in China über 21000 Kilometer lang ist? Sie wurde über 2000 Jahre hinweg erbaut."
|
| 277 |
+
],
|
| 278 |
+
[
|
| 279 |
+
"Der menschliche Körper besteht zu etwa 60 Prozent aus Wasser. Ein erwachsener Mensch hat ungefähr 100000 Kilometer Blutgefäße."
|
| 280 |
+
],
|
| 281 |
+
[
|
| 282 |
+
"Die Temperaturen steigen heute auf bis zu 28 Grad Celsius. Am Wochenende wird mit Schauern und Gewittern gerechnet."
|
| 283 |
+
],
|
| 284 |
+
[
|
| 285 |
+
"Der Dax schloss heute bei 18456,73 Punkten, ein Plus von 2,3 Prozent. Der Euro notiert bei 1,0892 Dollar."
|
| 286 |
+
],
|
| 287 |
+
[
|
| 288 |
+
"Es war einmal in einem fernen Königreich, wo die Zeit anders verlief und die Sterne näher schienen. Dort lebte eine weise Eule, die alle Geheimnisse des Waldes kannte."
|
| 289 |
+
],
|
| 290 |
+
],
|
| 291 |
+
inputs=text_input,
|
| 292 |
+
label="Try these examples:",
|
| 293 |
+
)
|
| 294 |
+
|
| 295 |
generate_btn.click(
|
| 296 |
+
fn=synthesize_speech_combined,
|
| 297 |
+
inputs=[text_input, voice_dropdown, pace_slider, use_gpu_checkbox],
|
| 298 |
+
outputs=[audio_output, preprocessed_output],
|
| 299 |
)
|
| 300 |
|
| 301 |
if __name__ == "__main__":
|
german_text_preprocessor.py
ADDED
|
@@ -0,0 +1,397 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
German Text Preprocessing Module for TTS
|
| 3 |
+
Handles normalization of numbers, dates, decimal numbers, and other text elements
|
| 4 |
+
to their spoken form in German.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import re
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class GermanTextPreprocessor:
    """
    Preprocesses German text for TTS by converting numbers, dates, and special
    characters into their spoken equivalents.

    Ordinals are always produced in the "-er" form ("erster", "zwanzigster");
    no grammatical case agreement with the surrounding sentence is attempted.
    """

    # Number words for German (0 maps to "" so it vanishes inside compounds)
    ONES = {
        0: "", 1: "eins", 2: "zwei", 3: "drei", 4: "vier",
        5: "fünf", 6: "sechs", 7: "sieben", 8: "acht", 9: "neun"
    }

    # Digit names for reading individual digits (including zero)
    DIGITS = {
        0: "null", 1: "eins", 2: "zwei", 3: "drei", 4: "vier",
        5: "fünf", 6: "sechs", 7: "sieben", 8: "acht", 9: "neun"
    }

    TEENS = {
        10: "zehn", 11: "elf", 12: "zwölf", 13: "dreizehn",
        14: "vierzehn", 15: "fünfzehn", 16: "sechzehn",
        17: "siebzehn", 18: "achtzehn", 19: "neunzehn"
    }

    TENS = {
        2: "zwanzig", 3: "dreißig", 4: "vierzig",
        5: "fünfzig", 6: "sechzig", 7: "siebzig",
        8: "achtzig", 9: "neunzig"
    }

    # (value, singular, plural), ordered from largest to smallest
    SCALES = [
        (1000000000, "Milliarde", "Milliarden"),
        (1000000, "Million", "Millionen"),
        (1000, "tausend", "tausend")
    ]

    # Ordinal number forms for 1-9 and 10-19
    ORDINAL_ONES = {
        1: "erster", 2: "zweiter", 3: "dritter", 4: "vierter",
        5: "fünfter", 6: "sechster", 7: "siebter", 8: "achter", 9: "neunter"
    }

    ORDINAL_TEENS = {
        10: "zehnter", 11: "elfter", 12: "zwölfter", 13: "dreizehnter",
        14: "vierzehnter", 15: "fünfzehnter", 16: "sechzehnter",
        17: "siebzehnter", 18: "achtzehnter", 19: "neunzehnter"
    }

    # Month names
    MONTHS = {
        1: "Januar", 2: "Februar", 3: "März", 4: "April",
        5: "Mai", 6: "Juni", 7: "Juli", 8: "August",
        9: "September", 10: "Oktober", 11: "November", 12: "Dezember"
    }

    MONTH_ABBREV = {
        "jan": "Januar", "feb": "Februar", "mär": "März", "apr": "April",
        "mai": "Mai", "jun": "Juni", "jul": "Juli", "aug": "August",
        "sep": "September", "sept": "September", "okt": "Oktober",
        "nov": "November", "dez": "Dezember"
    }

    # Lower-cased abbreviation OR full month name -> canonical month name.
    _MONTH_LOOKUP = {
        **MONTH_ABBREV,
        **{name.lower(): name for name in MONTHS.values()},
    }

    # Patterns used by _process_date to re-parse an already matched date.
    _NUMERIC_DATE_RE = re.compile(r'(\d{1,2})\.(\d{1,2})\.(\d{4})')
    _NAMED_DATE_RE = re.compile(r'(\d{1,2})\.\s*([A-Za-zäöüÄÖÜ]+)\.?\s*(\d{4})')
    _SHORT_DATE_RE = re.compile(r'(\d{1,2})\.(\d{1,2})\.')

    def __init__(self):
        """Initialize the German text preprocessor (no state is needed)."""
        pass

    def _number_to_words(self, num: int) -> str:
        """
        Convert a cardinal number to its German word form.

        Args:
            num: Integer to convert

        Returns:
            German word representation of the number
        """
        if num == 0:
            return "null"

        if num < 0:
            return "minus " + self._number_to_words(-num)

        # Handle 1-9
        if num < 10:
            return self.ONES[num]

        # Handle 10-19
        if num < 20:
            return self.TEENS[num]

        # Handle 20-99 (ones come first: "einundzwanzig")
        if num < 100:
            tens, ones = divmod(num, 10)
            if ones == 0:
                return self.TENS[tens]
            # Special case: "eins" becomes "ein" in compound numbers
            ones_word = "ein" if ones == 1 else self.ONES[ones]
            return f"{ones_word}und{self.TENS[tens]}"

        # Handle 100-999
        if num < 1000:
            hundreds, remainder = divmod(num, 100)
            prefix = "ein" if hundreds == 1 else self.ONES[hundreds]
            tail = self._number_to_words(remainder) if remainder else ""
            return f"{prefix}hundert{tail}"

        # Handle larger numbers using the largest scale that fits
        for scale, singular, plural in self.SCALES:
            if num < scale:
                continue

            quotient, remainder = divmod(num, scale)
            quotient_words = self._number_to_words(quotient)

            if scale == 1000:
                # Thousands are written as one word, and a trailing "eins"
                # loses its "s": "eintausend", "zweihunderteintausend".
                if quotient_words.endswith("eins"):
                    quotient_words = quotient_words[:-1]
                head = f"{quotient_words}{singular}"
                separator = ""
            else:
                # Millions/billions are separate nouns with singular/plural.
                if quotient == 1:
                    head = f"eine {singular}"
                else:
                    head = f"{quotient_words} {plural}"
                separator = " "

            if remainder == 0:
                return head
            return f"{head}{separator}{self._number_to_words(remainder)}"

        # Not reachable for num >= 1000; kept as a defensive fallback.
        return str(num)

    def _year_to_words(self, year: int) -> str:
        """
        Convert a year to its German spoken form.

        Years 1100-1999 are read in hundreds ("neunzehnhundertvierundneunzig");
        every other year is read as a plain cardinal number.

        Args:
            year: Year as integer (e.g., 1994, 2019)

        Returns:
            German spoken form of the year
        """
        if 1100 <= year <= 1999:
            hundreds, remainder = divmod(year, 100)
            spoken = f"{self._number_to_words(hundreds)}hundert"
            if remainder:
                spoken += self._number_to_words(remainder)
            return spoken

        # 1000-1099 ("eintausend...") and 2000+ use normal number reading
        return self._number_to_words(year)

    def _ordinal_to_words(self, num: int) -> str:
        """
        Convert a number to its German ordinal form.

        Args:
            num: Integer to convert to ordinal

        Returns:
            German ordinal word
        """
        if num < 1:
            return self._number_to_words(num) + "ter"

        # Handle 1-9 (irregular forms such as "erster", "dritter")
        if num < 10:
            return self.ORDINAL_ONES[num]

        # Handle 10-19
        if num < 20:
            return self.ORDINAL_TEENS[num]

        # In compounds >= 100 the last two digits decide the form, so a tail
        # of 1-19 reuses the small ordinals ("einhunderterster").
        tail = num % 100
        if num >= 100 and 1 <= tail < 20:
            head = num - tail
            # Mirror the cardinal spelling, which separates Million/Milliarde
            # from the rest with a space.
            separator = " " if head >= 1000000 else ""
            return (
                self._number_to_words(head)
                + separator
                + self._ordinal_to_words(tail)
            )

        # From 20 upwards ordinals take "-st-": "zwanzigster"
        return self._number_to_words(num) + "ster"

    def _process_decimal(self, match: re.Match) -> str:
        """
        Process decimal numbers like "3,1415" -> "drei komma eins vier eins fünf"

        Args:
            match: Regex match object containing the decimal number

        Returns:
            Spoken form of the decimal number
        """
        integer_text, _, fraction_text = match.group(0).partition(',')

        # Integer part is read as a normal cardinal
        spoken = self._number_to_words(int(integer_text) if integer_text else 0)

        # Decimal part - read digit by digit (including zeros)
        if fraction_text:
            digits = " ".join(self.DIGITS[int(ch)] for ch in fraction_text)
            spoken = f"{spoken} komma {digits}"

        return spoken

    def _process_date(self, match: re.Match) -> str:
        """
        Process dates in various formats:
        - "20.11.2019" -> "zwanzigster elfter zweitausendneunzehn"
        - "1. Jan. 1994" -> "erster Januar neunzehnhundertvierundneunzig"
        - "5.12." -> "fünfter zwölfter"

        Args:
            match: Regex match object containing the date

        Returns:
            Spoken form of the date, or the matched text unchanged if it fits
            none of the supported formats
        """
        date_str = match.group(0)

        # Pattern 1: DD.MM.YYYY or D.M.YYYY
        numeric = self._NUMERIC_DATE_RE.match(date_str)
        if numeric:
            day_word = self._ordinal_to_words(int(numeric.group(1)))
            month_word = self._ordinal_to_words(int(numeric.group(2)))
            year_word = self._year_to_words(int(numeric.group(3)))
            return f"{day_word} {month_word} {year_word}"

        # Pattern 2: D. Mon. YYYY or DD. Month YYYY
        named = self._NAMED_DATE_RE.match(date_str)
        if named:
            day_word = self._ordinal_to_words(int(named.group(1)))
            written_month = named.group(2)
            # Expand abbreviations / normalize month names; any other word is
            # kept exactly as written rather than being lower-cased.
            month_word = self._MONTH_LOOKUP.get(
                written_month.lower(), written_month
            )
            year_word = self._year_to_words(int(named.group(3)))
            return f"{day_word} {month_word} {year_word}"

        # Pattern 3: Just DD.MM. or D.M. (without year)
        short = self._SHORT_DATE_RE.match(date_str)
        if short:
            day_word = self._ordinal_to_words(int(short.group(1)))
            month_word = self._ordinal_to_words(int(short.group(2)))
            return f"{day_word} {month_word}"

        return date_str

    def _process_standalone_number(self, match: re.Match) -> str:
        """
        Process standalone cardinal numbers.

        Args:
            match: Regex match object containing the number

        Returns:
            Spoken form of the number
        """
        return self._number_to_words(int(match.group(0)))

    def preprocess(self, text: str) -> str:
        """
        Main preprocessing function that applies all transformations.

        Args:
            text: Input German text

        Returns:
            Preprocessed text with numbers, dates, etc. converted to spoken form
        """
        # Order matters! More specific patterns first

        # 1. Process dates (must come before decimal and integer processing)
        # Pattern: DD.MM.YYYY or D.M.YYYY
        text = re.sub(
            r'\b(\d{1,2})\.(\d{1,2})\.(\d{4})\b',
            self._process_date,
            text
        )

        # Pattern: D. Month YYYY or DD. Mon. YYYY
        text = re.sub(
            r'\b(\d{1,2})\.\s*([A-Za-zäöüÄÖÜ]+)\.?\s*(\d{4})\b',
            self._process_date,
            text
        )

        # Pattern: DD.MM. or D.M.
        text = re.sub(
            r'\b(\d{1,2})\.(\d{1,2})\.',
            self._process_date,
            text
        )

        # 2. Speak a leading minus sign ("-5" -> "minus 5"). Only a hyphen
        # that starts a token and is directly followed by a digit is touched,
        # so ranges such as "10-12" and hyphenated words are left alone.
        text = re.sub(r'(?<!\S)-(?=\d)', 'minus ', text)

        # 3. Process decimal numbers (before integers)
        # Pattern: number,digits (e.g., 3,1415 or 0,5)
        text = re.sub(
            r'\b\d+,\d+\b',
            self._process_decimal,
            text
        )

        # 4. Process standalone integers (cardinal numbers)
        # This will catch remaining numbers not processed by date/decimal patterns
        text = re.sub(
            r'\b\d+\b',
            self._process_standalone_number,
            text
        )

        # 5. Clean up any extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
# Convenience function for easy import and use
|
| 359 |
+
def preprocess_german_text(text: str) -> str:
    """
    Normalize German text for speech synthesis in a single call.

    A fresh ``GermanTextPreprocessor`` is created for each invocation and its
    ``preprocess`` method is applied to ``text``.

    Args:
        text: Input German text

    Returns:
        The text as returned by ``GermanTextPreprocessor.preprocess``
    """
    return GermanTextPreprocessor().preprocess(text)
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
# Example usage and testing
|
| 374 |
+
if __name__ == "__main__":
    # Small manual demo: print each sample next to its preprocessed form.
    demo_preprocessor = GermanTextPreprocessor()

    samples = (
        "Die Zahl ist 3",
        "Heute ist der 20.11.2019",
        "Geboren am 1. Jan. 1994",
        "Pi ist ungefähr 3,1415",
        "Es sind 42 Studenten in der Klasse",
        "Das Jahr 2023 war interessant",
        "Der Preis beträgt 19,99 Euro",
        "Am 5.12. ist Nikolaus",
        "Die Temperatur ist -5 Grad",
        "Es gibt 1000000 Möglichkeiten",
        "Im Jahr 1789 begann die Revolution",
    )

    print("German Text Preprocessing Examples:")
    print("=" * 80)
    for sample in samples:
        spoken = demo_preprocessor.preprocess(sample)
        print(f"Input: {sample}")
        print(f"Output: {spoken}")
        print("-" * 80)
|