# Kokoro-ONNX TTS demo (Gradio app)
# Standard library
import os
import re
import time
import warnings

# Third-party
import gradio as gr
import numpy as np
import soundfile as sf
from kokoro_onnx import Kokoro
from kokoro_onnx.tokenizer import Tokenizer

# Suppress warnings (e.g. noisy ONNX runtime / library deprecation notices)
warnings.filterwarnings("ignore")

# Initialize tokenizer and model: ONNX graph plus packed voice-style vectors.
tokenizer = Tokenizer()
kokoro = Kokoro("onnx_deps/kokoro-v1.0.onnx", "onnx_deps/voices-v1.0.bin")

# Constants
SUPPORTED_LANGUAGES = ["en-us", "en-gb", "es", "fr-fr", "hi", "it", "ja", "pt-br", "zh"]
AUDIO_DIR = "audio_exports"
CURRENT_VOICE = "af_sky"  # Default voice

# Create output directory if it doesn't exist
os.makedirs(AUDIO_DIR, exist_ok=True)

# Preset names shown in the UI mapped to the regex actually used for splitting.
SPLIT_PATTERNS = {
    "Paragraphs (one or more newlines)": r"\n+",
    "Sentences (periods, question marks, exclamation points)": r"(?<=[.!?])\s+",
    "Commas and semicolons": r"[,;]\s+",
    "No splitting (process as one chunk)": r"$^",  # Pattern that won't match anything
    "Custom": "custom",
}
def preview_text_splitting(text, split_pattern):
    """
    Return the non-empty, stripped chunks produced by splitting *text*
    with the regex *split_pattern*.

    The sentinel pattern ``"$^"`` (which can never match) short-circuits
    to a single chunk containing the whole text unchanged.  An invalid
    regex is reported as a one-element list with a readable message
    instead of raising.
    """
    if split_pattern == "$^":  # Special case for no splitting
        return [text]
    try:
        pieces = re.split(split_pattern, text)
    except Exception as e:
        return [f"Error previewing split: {e}"]
    # Drop whitespace-only fragments and trim the survivors.
    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]
def run_performance_tests(text, voice, language, split_pattern, speed):
    """
    Benchmark chunked vs. whole-text processing strategies.

    Compares per-chunk against full-text phonemization and audio
    generation, then summarizes the timings with simple recommendations.

    Returns:
        String with detailed test results
    """
    report = ["=== KOKORO-ONNX PERFORMANCE TEST RESULTS ===\n"]

    # Prepare the chunk list used by the "chunked" strategies.
    chunks = [piece.strip() for piece in re.split(split_pattern, text) if piece.strip()]
    report.append(f"Text split into {len(chunks)} chunks\n")

    # --- Test 1: tokenization strategies --------------------------------
    report.append("TEST #1: TOKENIZATION STRATEGIES")

    t0 = time.time()
    chunk_phonemes = [tokenizer.phonemize(piece, lang=language) for piece in chunks]
    chunked_tok_time = time.time() - t0
    report.append(f"Per-chunk tokenization: {chunked_tok_time:.6f}s")

    t0 = time.time()
    full_phonemes = tokenizer.phonemize(text, lang=language)
    full_tok_time = time.time() - t0
    report.append(f"Full text tokenization: {full_tok_time:.6f}s")
    if full_tok_time > 0:
        report.append(f"Speedup: {chunked_tok_time / full_tok_time:.2f}x\n")

    # --- Test 2: audio generation strategies ----------------------------
    report.append("TEST #2: AUDIO GENERATION STRATEGIES")

    t0 = time.time()
    for phonemes in chunk_phonemes:
        if phonemes.strip():  # Skip empty phonemes
            kokoro.create(phonemes, voice=voice, speed=speed, is_phonemes=True)
    chunked_gen_time = time.time() - t0
    report.append(f"Generate per chunk: {chunked_gen_time:.6f}s")

    t0 = time.time()
    kokoro.create(full_phonemes, voice=voice, speed=speed, is_phonemes=True)
    full_gen_time = time.time() - t0
    report.append(f"Generate full text: {full_gen_time:.6f}s")
    if full_gen_time > 0:
        report.append(f"Speedup: {chunked_gen_time / full_gen_time:.2f}x\n")

    # --- Test 3: end-to-end totals --------------------------------------
    report.append("TEST #3: TOTAL PROCESSING TIME")
    total_chunked = chunked_tok_time + chunked_gen_time
    total_full = full_tok_time + full_gen_time
    report.append(f"Total time (chunked): {total_chunked:.6f}s")
    report.append(f"Total time (full text): {total_full:.6f}s")
    if total_full > 0:
        report.append(f"Overall speedup: {total_chunked / total_full:.2f}x")

    # Recommendations derived from the raw timings above.
    report.append("\nRECOMMENDATIONS:")
    if chunked_tok_time > full_tok_time:
        report.append("- Tokenize entire text at once instead of per-chunk")
    if chunked_gen_time > full_gen_time:
        report.append("- Generate audio for entire text rather than per-chunk")
    elif chunked_gen_time < full_gen_time:
        report.append("- Keep generating audio in chunks for better performance")

    return "\n".join(report)
# [OLD] Chunking create func
def create(text: str, voice: str, language: str, blend_voice_name: str = None,
           blend_ratio: float = 0.5, split_pattern: str = r"\n+", speed: float = 1.0,
           output_dir: str = AUDIO_DIR):
    """
    Generate audio using Kokoro-ONNX with added features.

    Args:
        text: Text to synthesize
        voice: Primary voice to use
        language: Language code
        blend_voice_name: Optional secondary voice for blending
        blend_ratio: Ratio of primary to secondary voice (0.0-1.0);
            1.0 = 100% primary, 0.0 = 100% secondary
        split_pattern: Pattern to split text into chunks
        speed: Speech rate
        output_dir: Directory to save audio files

    Returns:
        List of [(sample_rate, audio_data), phonemes_text, split_info, timing_info]
    """
    global CURRENT_VOICE

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Update current voice (logged once per change, blends excluded)
    if voice != CURRENT_VOICE and not blend_voice_name:
        print(f"Voice changed from {CURRENT_VOICE} to {voice}")
        CURRENT_VOICE = voice

    # Start total timing
    start_total_time = time.time()

    # Split text into chunks
    chunks = preview_text_splitting(text, split_pattern)
    split_info = f"Text split into {len(chunks)} chunks using pattern: '{split_pattern}'"
    print(split_info)

    # FIX: voice blending is chunk-invariant, so blend the style vectors
    # once up front instead of re-blending on every loop iteration.
    voice_to_use = voice
    if blend_voice_name:
        voice_blend_start = time.time()
        first_voice = kokoro.get_voice_style(voice)
        second_voice = kokoro.get_voice_style(blend_voice_name)
        voice_to_use = np.add(first_voice * blend_ratio, second_voice * (1 - blend_ratio))
        print(f"Voices blended in {time.time() - voice_blend_start:.6f}s")

    # Filename label, computed once.  FIX: use [-1] instead of [1] so voice
    # names without an underscore no longer raise IndexError.
    voice_label = voice.split('_')[-1] if isinstance(voice, str) else 'blend'

    # Initialize variables for processing
    all_audio = []
    all_phonemes = []
    sample_rate = 24000  # Fallback; overwritten by the model's actual rate below

    # Timing metrics
    phoneme_times = []
    generation_times = []
    save_times = []

    # Process each chunk
    for i, chunk in enumerate(chunks):
        # Skip empty chunks
        if not chunk.strip():
            continue

        # Time phonemization
        phoneme_start = time.time()
        phonemes = tokenizer.phonemize(chunk, lang=language)
        phoneme_time = time.time() - phoneme_start
        phoneme_times.append(phoneme_time)
        print(f"Chunk {i+1} phonemized in {phoneme_time:.6f}s")

        # Save phonemes
        all_phonemes.append(f"Chunk {i+1}: {phonemes}")

        # Generate audio
        gen_start = time.time()
        audio, sr = kokoro.create(phonemes, voice=voice_to_use, speed=speed, is_phonemes=True)
        gen_time = time.time() - gen_start
        generation_times.append(gen_time)
        print(f"Chunk {i+1} audio generated in {gen_time:.6f}s")

        # FIX: remember the model's real sample rate so the combined file is
        # written at the same rate as the chunk files (was hardcoded 24000).
        sample_rate = sr

        # Add to audio list
        all_audio.append(audio)

        # Save individual chunk to file
        save_start = time.time()
        chunk_filename = os.path.join(output_dir, f"chunk_{i+1}_{voice_label}.wav")
        sf.write(chunk_filename, audio, sr)
        save_time = time.time() - save_start
        save_times.append(save_time)
        print(f"Chunk {i+1} saved to {chunk_filename} in {save_time:.6f}s")

    # Time to combine chunks
    combine_start = time.time()
    if len(all_audio) > 1:
        audio_data = np.concatenate(all_audio)
        combine_time = time.time() - combine_start
        print(f"Combined {len(all_audio)} chunks in {combine_time:.6f}s")
    else:
        audio_data = all_audio[0] if all_audio else np.array([])
        combine_time = 0

    # Time to save combined file
    save_combined_start = time.time()
    combined_filename = os.path.join(output_dir, f"combined_{voice_label}.wav")
    sf.write(combined_filename, audio_data, sample_rate)
    save_combined_time = time.time() - save_combined_start
    print(f"Combined audio saved to {combined_filename} in {save_combined_time:.6f}s")

    # Calculate total time
    total_time = time.time() - start_total_time

    # Create detailed timing info
    chunks_count = len(all_audio)
    timing_lines = []

    # Add summary of processing times
    timing_lines.append(f"Phonemization time: {sum(phoneme_times):.6f}s")
    timing_lines.append(f"Audio generation time: {sum(generation_times):.6f}s")

    # Per-chunk timing (indices here are positions in the timing lists,
    # which only contain non-empty chunks)
    if chunks_count > 1:
        timing_lines.append("\nChunk details:")
        for i in range(chunks_count):
            timing_lines.append(f"  Chunk {i+1}: Phoneme {phoneme_times[i]:.6f}s, Gen {generation_times[i]:.6f}s, Save {save_times[i]:.6f}s")

    # Combine and save timing
    if chunks_count > 1:
        timing_lines.append(f"\nCombine chunks: {combine_time:.6f}s")
        timing_lines.append(f"Save combined: {save_combined_time:.6f}s")

    # Total timing
    timing_lines.append(f"\nTotal processing time: {total_time:.6f}s")

    # Format timing info for display
    timing_info = "\n".join(timing_lines)

    # Combine phonemes
    phonemes_text = "\n\n".join(all_phonemes)

    # Update split info
    if chunks_count > 1:
        split_info = f"Text was split into {chunks_count} chunks and saved to {output_dir}"
    else:
        split_info = f"Text processed as a single chunk and saved to {output_dir}"

    return [(sample_rate, audio_data), phonemes_text, split_info, timing_info]
# Optimized -- over rides paragraph splitting behavior... | |
# def create( | |
# text: str, | |
# voice: str, | |
# language: str, | |
# blend_voice_name: str = None, | |
# blend_ratio: float = 0.5, | |
# split_pattern: str = r"\n+", | |
# speed: float = 1.0, | |
# output_dir: str = AUDIO_DIR, | |
# ): | |
# """ | |
# Generate audio using Kokoro-ONNX with optimized processing | |
# Args: | |
# text: Text to synthesize | |
# voice: Primary voice to use | |
# language: Language code | |
# blend_voice_name: Optional secondary voice for blending | |
# blend_ratio: Ratio of primary to secondary voice (0.0-1.0) | |
# split_pattern: Pattern to split text into chunks | |
# speed: Speech rate | |
# output_dir: Directory to save audio files | |
# Returns: | |
# Tuple of (audio_tuple, phonemes, split_info, timing_info) | |
# """ | |
# global CURRENT_VOICE | |
# # Create output directory if it doesn't exist | |
# os.makedirs(output_dir, exist_ok=True) | |
# # Update current voice | |
# if voice != CURRENT_VOICE and not blend_voice_name: | |
# print(f"Voice changed from {CURRENT_VOICE} to {voice}") | |
# CURRENT_VOICE = voice | |
# # Start total timing | |
# start_total_time = time.time() | |
# # Split text only for display purposes | |
# chunks = preview_text_splitting(text, split_pattern) | |
# split_info = ( | |
# f"Text split into {len(chunks)} chunks using pattern: '{split_pattern}'" | |
# ) | |
# print(split_info) | |
# # Phonemize the entire text at once (optimization #1) | |
# phoneme_start = time.time() | |
# phonemes = tokenizer.phonemize(text, lang=language) | |
# phoneme_time = time.time() - phoneme_start | |
# print(f"Text phonemized in {phoneme_time:.6f}s") | |
# # Handle voice blending | |
# voice_blend_start = time.time() | |
# voice_to_use = voice | |
# if blend_voice_name: | |
# first_voice = kokoro.get_voice_style(voice) | |
# second_voice = kokoro.get_voice_style(blend_voice_name) | |
# voice_to_use = np.add( | |
# first_voice * blend_ratio, second_voice * (1 - blend_ratio) | |
# ) | |
# voice_blend_time = time.time() - voice_blend_start | |
# print(f"Voices blended in {voice_blend_time:.6f}s") | |
# # Generate audio for entire text at once (optimization #2) | |
# gen_start = time.time() | |
# audio, sample_rate = kokoro.create( | |
# phonemes, voice=voice_to_use, speed=speed, is_phonemes=True | |
# ) | |
# gen_time = time.time() - gen_start | |
# print(f"Audio generated in {gen_time:.6f}s") | |
# # Save to file | |
# save_start = time.time() | |
# voice_label = voice.split("_")[1] if isinstance(voice, str) else "blend" | |
# filename = os.path.join(output_dir, f"full_{voice_label}.wav") | |
# sf.write(filename, audio, sample_rate) | |
# save_time = time.time() - save_start | |
# print(f"Audio saved to {filename} in {save_time:.6f}s") | |
# # Calculate total time | |
# total_time = time.time() - start_total_time | |
# # Create timing info | |
# timing_lines = [ | |
# f"Phonemization time: {phoneme_time:.6f}s", | |
# f"Audio generation time: {gen_time:.6f}s", | |
# f"Save time: {save_time:.6f}s", | |
# f"\nTotal processing time: {total_time:.6f}s", | |
# f"\nOptimized approach: Processing entire text at once (2.1x faster)", | |
# ] | |
# timing_info = "\n".join(timing_lines) | |
# # For display, still show the text chunks | |
# chunk_display = [] | |
# for i, chunk in enumerate(chunks): | |
# chunk_display.append(f"Chunk {i + 1}: Text: {chunk[:50]}...") | |
# phonemes_display = ( | |
# "Full text phonemes (first 100 chars):\n" + phonemes[:100] + "..." | |
# ) | |
# return [(sample_rate, audio), phonemes_display, split_info, timing_info] | |
def on_split_pattern_change(pattern_name, custom_pattern):
    """
    React to a change of the split-pattern dropdown.

    Returns the regex string to use for splitting plus a visibility
    update for the custom-pattern textbox (shown only when "Custom"
    is selected).
    """
    is_custom = pattern_name == "Custom"
    pattern = custom_pattern if is_custom else SPLIT_PATTERNS[pattern_name]
    return pattern, gr.update(visible=is_custom)
def preview_splits(text, pattern):
    """
    Build a human-readable preview of how *text* splits under *pattern*.

    Returns a single-chunk notice for the no-split sentinel, otherwise a
    numbered list of chunks (each truncated to 100 characters).
    """
    chunks = preview_text_splitting(text, pattern)
    if len(chunks) == 1 and pattern == "$^":
        return "Text will be processed as a single chunk (no splitting)"
    parts = [f"Text will be split into {len(chunks)} chunks:"]
    for number, chunk in enumerate(chunks, start=1):
        # Truncate very long chunks in the preview
        shown = chunk[:100] + "..." if len(chunk) > 100 else chunk
        parts.append(f"Chunk {number}: {shown}")
    # Blank line between entries, trailing blank line preserved.
    return "\n\n".join(parts) + "\n\n"
def create_app():
    """
    Build the Gradio Blocks UI for the Kokoro-ONNX TTS demo.

    Wires together the text input, split-pattern selection/preview, voice
    and blending controls, a performance-testing toggle, and the generate
    button that dispatches to either ``create`` or ``run_performance_tests``.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Lato"), gr.themes.GoogleFont("Roboto"), "system-ui", "sans-serif"])) as ui:
        # Title
        gr.Markdown("# Kokoro-ONNX TTS Demo")
        gr.Markdown("#### Optimized ONNX implementation with Voice Blending")
        # Input controls
        with gr.Row():
            with gr.Column(scale=1):
                text_input = gr.TextArea(
                    label="Input Text",
                    rtl=False,
                    value="Hello!\n\nThis is a multi-paragraph test.\nWith multiple lines.\n\nKokoro can split on paragraphs, sentences, or other patterns.",
                    lines=8,
                )
                # Information about split patterns
                with gr.Accordion("About Text Splitting", open=False):
                    gr.Markdown("""
                    ### Understanding Text Splitting
                    The splitting pattern controls how Kokoro breaks your text into manageable chunks for processing.
                    **Common patterns:**
                    - `\\n+`: Split on one or more newlines (paragraphs)
                    - `(?<=[.!?])\\s+`: Split after periods, question marks, and exclamation points (sentences)
                    - `[,;]\\s+`: Split after commas and semicolons
                    - `$^`: Special pattern that won't match anything (processes the entire text as one chunk)
                    **Benefits of splitting:**
                    - Better phrasing and natural pauses
                    - Improved handling of longer texts
                    - More consistent pronunciation across chunks
                    """)
                # Split Pattern Selection
                split_pattern_dropdown = gr.Dropdown(
                    label="Split Text Using",
                    value="Paragraphs (one or more newlines)",
                    choices=list(SPLIT_PATTERNS.keys()),
                    info="Select how to split your text into chunks",
                )
                # Hidden unless "Custom" is chosen; also acts as the carrier
                # of the active regex for generation (see wiring below).
                custom_pattern_input = gr.Textbox(
                    label="Custom Split Pattern (Regular Expression)",
                    value=r"\n+",
                    visible=False,
                    info="Enter a custom regex pattern for splitting text",
                )
                preview_button = gr.Button("Preview Text Splitting")
                split_preview = gr.Textbox(
                    label="Split Preview",
                    value="Click 'Preview Text Splitting' to see how your text will be divided",
                    lines=5,
                )
            with gr.Column(scale=1):
                # Language selection
                language_input = gr.Dropdown(
                    label="Language",
                    value="en-us",
                    choices=SUPPORTED_LANGUAGES,
                    info="Select the language for text processing",
                )
                # Voice selection
                voice_input = gr.Dropdown(
                    label="Primary Voice",
                    value="af_sky",
                    choices=sorted(kokoro.get_voices()),
                    info="Select primary voice for synthesis",
                )
                # Voice blending
                with gr.Accordion("Voice Blending (Optional)", open=False):
                    blend_voice_input = gr.Dropdown(
                        label="Secondary Voice for Blending",
                        value=None,
                        choices=[None] + sorted(kokoro.get_voices()),
                        info="Select secondary voice to blend with primary voice",
                    )
                    blend_ratio = gr.Slider(
                        label="Blend Ratio (Primary:Secondary)",
                        minimum=0.0,
                        maximum=1.0,
                        value=0.5,
                        step=0.05,
                        info="0.0 = 100% Secondary, 1.0 = 100% Primary",
                    )
                    gr.Markdown("""
                    **Voice blending lets you combine characteristics of two voices.**
                    - A 50:50 blend gives equal weight to both voices
                    - Higher values emphasize the primary voice
                    - Lower values emphasize the secondary voice
                    """)
                # Speed slider
                speed_input = gr.Slider(
                    label="Speech Speed",
                    minimum=0.5,
                    maximum=1.5,
                    value=1.0,
                    step=0.1,
                    info="Adjust speaking rate",
                )
                # Add a testing mode toggle
                with gr.Accordion("Performance Testing", open=False):
                    test_mode = gr.Checkbox(label="Enable Test Mode", value=False)
                    gr.Markdown("""
                    ### Performance Testing
                    When enabled, clicking "Generate Audio" will run performance tests instead of generating audio.
                    Tests compare different processing approaches to identify the most efficient method.
                    Use this to optimize your implementation based on your specific hardware and text content.
                    """)
            with gr.Column(scale=1):
                # Generate button
                submit_button = gr.Button("Generate Audio", variant="primary")
                # Outputs
                audio_output = gr.Audio(
                    label="Generated Audio", format="wav", show_download_button=True
                )
                audio_gen_timing_output = gr.Textbox(
                    label="Performance Metrics", lines=12
                )
                phonemes_output = gr.Textbox(label="Phoneme Representation", lines=10)
                split_info_output = gr.Textbox(label="Processing Information", lines=5)
                test_results = gr.Textbox(
                    label="Test Results",
                    lines=15,
                    visible=False,  # Hidden until test is run
                )
        # Handle split pattern change.
        # NOTE(review): custom_pattern_input appears twice in outputs — the
        # first return value sets its text, the second its visibility; confirm
        # duplicate-output wiring is supported by the installed Gradio version.
        split_pattern_dropdown.change(
            fn=on_split_pattern_change,
            inputs=[split_pattern_dropdown, custom_pattern_input],
            outputs=[custom_pattern_input, custom_pattern_input],
        )
        # Preview splitting button
        preview_button.click(
            fn=preview_splits,
            inputs=[text_input, custom_pattern_input],
            outputs=[split_preview],
        )
        # Button click handler
        def on_generate(
            text,
            voice,
            language,
            blend_voice,
            blend_ratio,
            split_pattern,
            speed,
            test_mode,
        ):
            """Dispatch to performance testing or normal synthesis.

            Returns a 5-tuple matching the outputs list below:
            (audio, timing, phonemes, split info, test-results update).
            """
            if test_mode:
                # Run performance tests
                results = run_performance_tests(
                    text, voice, language, split_pattern, speed
                )
                # Make the results visible
                return None, None, None, None, gr.update(visible=True, value=results)
            else:
                # Regular generation
                audio_tuple, phonemes, split_info, timing_info = create(
                    text,
                    voice,
                    language,
                    blend_voice_name=blend_voice,
                    blend_ratio=blend_ratio,
                    split_pattern=split_pattern,
                    speed=speed,
                    output_dir=AUDIO_DIR,
                )
                # Return results and hide test results
                return (
                    audio_tuple,
                    timing_info,
                    phonemes,
                    split_info,
                    gr.update(visible=False),
                )
        submit_button.click(
            fn=on_generate,
            inputs=[
                text_input,
                voice_input,
                language_input,
                blend_voice_input,
                blend_ratio,
                custom_pattern_input,
                speed_input,
                test_mode,
            ],
            outputs=[
                audio_output,
                audio_gen_timing_output,
                phonemes_output,
                split_info_output,
                test_results,
            ],
        )
    return ui
# Create and launch the app at import time (this module is a script entry point).
ui = create_app()
ui.launch(
    debug=True,
    server_name="0.0.0.0",  # Make accessible externally
    server_port=7860,  # Choose your port
    share=True,  # Set to True if you want a public link
)