import os

import gradio as gr
import numpy as np
import onnxruntime as ort
import spaces
import torch
import torch._inductor
from huggingface_hub import hf_hub_download

from char_tokenizers import GermanCharsTokenizer
from german_text_preprocessor import preprocess_german_text
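# char_tokenizers and german_text_preprocessor are presumably local modules
# shipped alongside app.py in this Space (they are not PyPI packages).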

# --- Download Model Files from Hugging Face ---
def download_models():
    """
    Download model files from Hugging Face repositories at startup.
    Files are downloaded to the aot_package folder.
    """
    os.makedirs("aot_package", exist_ok=True)

    # Define the models and their files
    models_config = {
        "Warholt/CaroTTS-60M-DE-Karlsson": [
            "karlsson_fastpitch_encoder.pt2",
            "karlsson_fastpitch_decoder.pt2",
            "karlsson_hifigan.pt2",
            "karlsson_fastpitch.onnx",
            "karlsson_hifigan.onnx",
        ],
        "Warholt/CaroTTS-60M-DE-Caro": [
            "caro_fastpitch_encoder.pt2",
            "caro_fastpitch_decoder.pt2",
            "caro_hifigan.pt2",
            "caro_fastpitch.onnx",
            "caro_hifigan.onnx",
        ],
    }

    print("Downloading model files from Hugging Face...")
    for repo_id, files in models_config.items():
        for filename in files:
            print(f"  Downloading {filename} from {repo_id}...")
            hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir="aot_package",
            )
    print("All model files downloaded successfully!")

# Download models at startup
download_models()
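# Note: hf_hub_download reuses files that are already present and up to date
# in local_dir, so repeated Space restarts should not re-download everything.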

# --- 1. Define a Wrapper for Lazy Loading ---
class LazyAotPackage(torch.nn.Module):
    """
    A wrapper that holds the path to an AOT package and loads it
    to the GPU only when forward() is called.
    """

    def __init__(self, package_path):
        super().__init__()
        self.package_path = package_path
        self.runner = None

    def forward(self, *args, **kwargs):
        # We are now inside the @spaces.GPU decorated function,
        # so a valid GPU context exists.
        # If the runner is not loaded, load it now.
        if self.runner is None:
            # Load directly to the active CUDA device
            self.runner = torch._inductor.aoti_load_package(self.package_path)

        # Run inference. The try/except is here because if ZeroGPU swaps the
        # underlying hardware between requests, the old runner might be invalid.
        try:
            return self.runner(*args, **kwargs)
        except RuntimeError:
            # Context might be stale, reload
            self.runner = torch._inductor.aoti_load_package(
                self.package_path, device="cuda"
            )
            return self.runner(*args, **kwargs)

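# For reference, a rough sketch of how a .pt2 AOTInductor package like the
# ones loaded above is typically produced offline (PyTorch >= 2.5 API;
# `model` and `example_inputs` are hypothetical placeholders, not part of
# this app):
#
#   ep = torch.export.export(model, example_inputs)
#   torch._inductor.aoti_compile_and_package(ep, package_path="model.pt2")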

# --- 2. Initialize Global Components ---
TOKENIZER = GermanCharsTokenizer()

# Instead of a dict of raw paths, we instantiate our lazy loaders immediately.
# These act like standard PyTorch modules but use almost no RAM until inference.
MODELS = {
    "Caro": {
        "encoder": LazyAotPackage("aot_package/caro_fastpitch_encoder.pt2"),
        "decoder": LazyAotPackage("aot_package/caro_fastpitch_decoder.pt2"),
        "vocoder": LazyAotPackage("aot_package/caro_hifigan.pt2"),
    },
    "Karlsson": {
        "encoder": LazyAotPackage("aot_package/karlsson_fastpitch_encoder.pt2"),
        "decoder": LazyAotPackage("aot_package/karlsson_fastpitch_decoder.pt2"),
        "vocoder": LazyAotPackage("aot_package/karlsson_hifigan.pt2"),
    },
}

# Initialize ONNX sessions for CPU inference
ONNX_SESSIONS = {
    "Caro": {
        "fastpitch": ort.InferenceSession("aot_package/caro_fastpitch.onnx"),
        "hifigan": ort.InferenceSession("aot_package/caro_hifigan.onnx"),
    },
    "Karlsson": {
        "fastpitch": ort.InferenceSession("aot_package/karlsson_fastpitch.onnx"),
        "hifigan": ort.InferenceSession("aot_package/karlsson_hifigan.onnx"),
    },
}
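# By default onnxruntime picks from the providers available in the build; the
# execution provider could also be pinned explicitly, e.g.
#   ort.InferenceSession(path, providers=["CPUExecutionProvider"])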

# --- 3. CPU Inference Function (ONNX) ---
def synthesize_speech_cpu(text: str, voice: str, pace: float = 1.0):
    """
    Synthesize speech using ONNX models on CPU.
    """
    if not text.strip():
        return None

    # Preprocess text
    preprocessed_text = preprocess_german_text(text)

    # Tokenize text
    tokens = TOKENIZER.encode(preprocessed_text)

    # Prepare inputs for FastPitch
    paces = np.full(len(tokens), pace, dtype=np.float32)
    pitches = np.zeros(len(tokens), dtype=np.float32)
    inputs = {
        "text": np.array([tokens], dtype=np.int64),
        "pace": np.array([paces], dtype=np.float32),
        "pitch": np.array([pitches], dtype=np.float32),
    }

    # Get ONNX sessions for the selected voice
    fastpitch_session = ONNX_SESSIONS[voice]["fastpitch"]
    hifigan_session = ONNX_SESSIONS[voice]["hifigan"]

    # Generate spectrogram with FastPitch
    spec = fastpitch_session.run(None, inputs)[0]

    # Generate audio with HiFiGAN
    gan_inputs = {"spec": spec}
    audio = hifigan_session.run(None, gan_inputs)[0]

    # Convert to format expected by Gradio
    sample_rate = 44100
    audio_array = audio.squeeze()
    return (sample_rate, audio_array)
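# Quick local smoke test (a sketch; assumes the ONNX files have been
# downloaded into aot_package/):
#   sr, wav = synthesize_speech_cpu("Guten Tag.", "Karlsson")
#   print(sr, wav.shape)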

# --- 4. GPU Inference Function ---
@spaces.GPU(duration=60)
def synthesize_speech(text: str, voice: str, pace: float = 1.0):
    """
    Synthesize speech. The @spaces.GPU decorator ensures a GPU is assigned
    for the duration of this function.
    """
    if not text.strip():
        return None

    # Preprocess text: convert numbers, dates, decimals to spoken form
    preprocessed_text = preprocess_german_text(text)

    # Tokenize text
    tokens = TOKENIZER.encode(preprocessed_text)
    tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to("cuda")

    # Prepare control parameters
    pitch_tensor = torch.zeros_like(tokens_tensor, dtype=torch.float32).to("cuda")
    pace_tensor = torch.ones_like(tokens_tensor, dtype=torch.float32).to("cuda") * pace

    # Retrieve the correct lazy-loaded models.
    # The forward() call inside these objects will trigger the load to GPU.
    encoder = MODELS[voice]["encoder"]
    decoder = MODELS[voice]["decoder"]
    vocoder = MODELS[voice]["vocoder"]

    with torch.inference_mode():
        # 1. Run encoder (loads .pt2 to GPU if needed -> runs)
        len_regulated, dec_lens, spk_emb = encoder(
            tokens_tensor, pitch_tensor, pace_tensor
        )
        # 2. Run decoder (loads .pt2 to GPU if needed -> runs)
        spec = decoder(len_regulated, dec_lens, spk_emb)
        # 3. Run vocoder (loads .pt2 to GPU if needed -> runs)
        audio = vocoder(spec)

    # Convert to numpy and return
    sample_rate = 44100
    audio_array = audio.squeeze().cpu().numpy()
    return (sample_rate, audio_array)
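# On ZeroGPU Spaces, the GPU is attached when a @spaces.GPU function is called
# and released when it returns; duration=60 tells the scheduler the expected
# upper bound in seconds, which is used for queueing and quota accounting.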

# --- 5. Combined Inference Function ---
def synthesize_speech_combined(
    text: str, voice: str, pace: float = 1.0, use_gpu: bool = False
):
    """
    Route to GPU or CPU inference based on user selection.
    """
    if use_gpu:
        return synthesize_speech(text, voice, pace)
    else:
        return synthesize_speech_cpu(text, voice, pace)

# --- 6. Gradio Interface ---
with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
    gr.Markdown(
        """
# 🎙️ German Text-to-Speech
Generate German speech using two different voices: **Caro** and **Karlsson**.
Numbers, dates, and decimals are automatically converted to spoken form.
"""
    )
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to synthesize",
                value="Guten Tag. Herzlich willkommen zu dieser Demonstration. Es stehen Ihnen zwei Stimmen zur Auswahl: Caro und Karlsson. Sie können außerdem die Sprechgeschwindigkeit anpassen. Unten finden Sie ein paar Beispielsätze. Probieren Sie es aus!",
                lines=3,
                max_length=1024,
            )
            char_counter = gr.Markdown("**Characters: 0 / 1024**")
            voice_dropdown = gr.Dropdown(
                choices=["Caro", "Karlsson"], label="Voice", value="Karlsson"
            )
            pace_slider = gr.Slider(
                minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speaking Rate"
            )
            use_gpu_checkbox = gr.Checkbox(
                label="Use GPU (ZeroGPU)",
                value=True,
                info="Enable for faster inference on GPU. Disable for CPU inference (slower but always available).",
            )
            generate_btn = gr.Button("Generate Speech 🔊", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="numpy")

    # Example sentences section
    gr.Markdown("### 📝 Example Sentences")
    gr.Examples(
        examples=[
            [
                "Die Bundeskanzlerin empfing heute den französischen Präsidenten zu einem Staatsbesuch in Berlin. Die Gespräche dauerten mehr als 3 Stunden."
            ],
            [
                "Am 15. März 2024 wird die neue Ausstellung im Museum eröffnet. Der Eintritt kostet 12,50 Euro für Erwachsene."
            ],
            [
                "In der verzauberten Bibliothek entdeckte die junge Magierin ein uraltes Buch, dessen Seiten im Mondlicht golden schimmerten."
            ],
            [
                "Der mutige Ritter zog sein Schwert und stellte sich dem feuerspeienden Drachen. Ein epischer Kampf begann auf dem Gipfel des Berges."
            ],
            [
                "Wussten Sie, dass die Große Mauer in China über 21000 Kilometer lang ist? Sie wurde über 2000 Jahre hinweg erbaut."
            ],
            [
                "Der menschliche Körper besteht zu etwa 60 Prozent aus Wasser. Ein erwachsener Mensch hat ungefähr 100000 Kilometer Blutgefäße."
            ],
            [
                "Die Temperaturen steigen heute auf bis zu 28 Grad Celsius. Am Wochenende wird mit Schauern und Gewittern gerechnet."
            ],
            [
                "Der Dax schloss heute bei 18456,73 Punkten, ein Plus von 2,3 Prozent. Der Euro notiert bei 1,0892 Dollar."
            ],
            [
                "Es war einmal in einem fernen Königreich, wo die Zeit anders verlief und die Sterne näher schienen. Dort lebte eine weise Eule, die alle Geheimnisse des Waldes kannte."
            ],
        ],
        inputs=text_input,
        label="Try these examples:",
    )

    # Update character counter
    def update_char_count(text):
        count = len(text)
        return f"**Characters: {count} / 1024**"

    text_input.change(
        fn=update_char_count,
        inputs=text_input,
        outputs=char_counter,
    )

    generate_btn.click(
        fn=synthesize_speech_combined,
        inputs=[text_input, voice_dropdown, pace_slider, use_gpu_checkbox],
        outputs=audio_output,
    )

if __name__ == "__main__":
    demo.launch()