Spaces:
Sleeping
Sleeping
File size: 8,373 Bytes
df30b4e c70dcc1 509ede7 c70dcc1 22a2bf5 509ede7 df30b4e 509ede7 df30b4e 509ede7 df30b4e a3ece9e df30b4e c70dcc1 509ede7 df30b4e c70dcc1 509ede7 df30b4e c70dcc1 df30b4e c70dcc1 a3ece9e df30b4e a3ece9e c70dcc1 a3ece9e df30b4e 748a421 df30b4e 509ede7 df30b4e 509ede7 c70dcc1 509ede7 df30b4e 22a2bf5 df30b4e 22a2bf5 a3ece9e df30b4e a3ece9e df30b4e a3ece9e df30b4e 509ede7 df30b4e 509ede7 df30b4e 509ede7 df30b4e a3ece9e 509ede7 df30b4e a3ece9e df30b4e a3ece9e df30b4e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 |
import gradio as gr
import numpy as np
import tempfile
import os
from kittentts import KittenTTS
import soundfile as sf
# Initialize the TTS model once at import time; generate_speech() reuses this
# module-level instance for every request.
print("Loading KittenTTS model from Hugging Face...")
try:
    tts_model = KittenTTS("KittenML/kitten-tts-nano-0.1")
except Exception as e:
    # Startup boundary: report the failure, then re-raise so the app does not
    # come up without a model.
    print(f"❌ Error loading model: {e}")
    print("Make sure the kittentts package is properly installed")
    raise
else:
    print("✅ KittenTTS model loaded successfully!")
# Friendly voice names (shown in the UI) mapped to KittenTTS voice identifiers.
VOICE_MAPPING = {
    "Voice 2 - Male": "expr-voice-2-m",
    "Voice 2 - Female": "expr-voice-2-f",
    "Voice 3 - Male": "expr-voice-3-m",
    "Voice 3 - Female": "expr-voice-3-f",
    "Voice 4 - Male": "expr-voice-4-m",
    "Voice 4 - Female": "expr-voice-4-f",
    "Voice 5 - Male": "expr-voice-5-m",
    "Voice 5 - Female": "expr-voice-5-f",
}

# Available voices from the model. Derived from the mapping so the two can
# never drift apart; dict insertion order keeps the original ordering.
AVAILABLE_VOICES = list(VOICE_MAPPING.values())
print(f"✅ Available voices: {AVAILABLE_VOICES}")

MAX_CHARS = 420  # we don't know the exact limit at this point - works experimentally
def generate_speech(text, voice_choice):
    """Generate speech from text using KittenTTS with voice selection.

    Args:
        text (str): The text to convert to speech. ``None``, empty, or
            whitespace-only input is rejected with a status message.
        voice_choice (str): The selected voice option, looked up in
            ``VOICE_MAPPING``. An unknown option falls back to the model's
            default voice.

    Returns:
        tuple: ``(audio, status)``. On success ``audio`` is
        ``(sample_rate, audio_array)`` for the Gradio audio component; on any
        failure it is ``None``. ``status`` is always a human-readable message.
    """
    if text is None or not text.strip():
        return None, "Please enter some text to generate speech."

    # Check text length - KittenTTS nano model has context limitations.
    # Count the user's own characters, before any padding is appended.
    char_count = len(text)
    if char_count > MAX_CHARS:
        return None, (
            f"Text too long! Please limit to {MAX_CHARS} characters. "
            f"Current length: {char_count} characters."
        )

    # Added because the model cuts off the audio sometimes.
    padded_text = text + " ..."

    try:
        # Get voice identifier (None when the choice is not a known voice).
        voice_id = VOICE_MAPPING.get(voice_choice)

        # Generate audio using KittenTTS
        if voice_id is not None:
            print(f"Using voice: {voice_choice} ({voice_id})")
            audio = tts_model.generate(padded_text, voice=voice_id)
        else:
            # Fall back to default voice
            audio = tts_model.generate(padded_text)

        # KittenTTS returns audio at 24kHz sample rate
        sample_rate = 24000

        # Ensure audio is in the right format for Gradio
        if isinstance(audio, np.ndarray):
            # Make sure audio is float32 and within [-1.0, 1.0]
            audio = audio.astype(np.float32)
            if audio.size > 0:
                peak = np.max(np.abs(audio))
                if peak > 1.0:
                    audio = audio / peak

        voice_msg = f" with {voice_choice}" if voice_id is not None else ""
        return (sample_rate, audio), (
            f"Speech generated successfully{voice_msg}! ({char_count} characters)"
        )
    except Exception as e:
        # UI boundary: turn any failure into a status message instead of
        # crashing the request.
        error_msg = str(e)
        print(f"Error details: {e}")

        # Provide helpful error messages for common issues
        if "INVALID_ARGUMENT" in error_msg and "Expand" in error_msg:
            return None, "Text is too long or complex for the model. Please try shorter, simpler text."
        if "ONNXRuntimeError" in error_msg:
            return None, "Model processing error. Try shorter text or simpler punctuation."
        return None, f"Error generating speech: {error_msg}"
# NOTE(review): the emoji in the strings below were reconstructed from
# mis-encoded text; confirm them (especially the examples heading) against the
# deployed app.

# Intro text shown at the top of the page. Built from joined lines so the
# rendered Markdown does not depend on source indentation.
_INTRO_MARKDOWN = "\n".join([
    "# 🐱 KittenTTS - High Quality Text-to-Speech",
    "",
    "Generate high-quality speech from text using [KittenTTS](https://huggingface.co/KittenML/kitten-tts-nano-0.1),",
    "a lightweight TTS model that works without GPU!",
    "Choose from multiple voice options and enter your text to hear the synthesized speech.",
])

# Footer with model facts and usage tips. The blank line before "Usage Tips"
# keeps that heading from being absorbed into the preceding bullet list.
_FOOTER_MARKDOWN = "\n".join([
    "---",
    "**About KittenTTS Nano:**",
    "- Lightweight 15M parameter text-to-speech model",
    "- Works without GPU - optimized for efficiency",
    "- Multiple voice options (male and female variants)",
    "- 24kHz output sample rate",
    "- **Best with short texts (under 400 characters)**",
    "- Model: [KittenML/kitten-tts-nano-0.1](https://huggingface.co/KittenML/kitten-tts-nano-0.1)",
    "- Built by [KittenML](https://github.com/KittenML/KittenTTS)",
    "",
    "**Usage Tips for Nano Model:**",
    "- ✅ Keep text short and simple (about 400 characters)",
    "- ✅ Use common words and standard punctuation",
    "- ✅ Break long content into shorter sentences",
    "- ❌ Avoid very long sentences or complex punctuation",
    "- ❌ Avoid technical jargon or unusual words",
])

# (text, voice) pairs offered as one-click examples.
_EXAMPLES = [
    ["Hello world! This is KittenTTS.", "Voice 2 - Female"],
    ["The quick brown fox jumps over the lazy dog.", "Voice 3 - Male"],
    ["This model works without a GPU.", "Voice 4 - Female"],
    ["Welcome to KittenTTS!", "Voice 5 - Male"],
    ["How are you today?", "Voice 2 - Male"],
    ["The weather is nice today.", "Voice 3 - Female"],
]


def create_interface():
    """Create the Gradio interface.

    Builds the page (voice dropdown, text box, generate button, status box,
    audio player, examples, footer) and wires both the button click and the
    text box's Enter key to ``generate_speech``.

    Returns:
        The assembled ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    voice_names = list(VOICE_MAPPING)

    # NOTE(review): the row/column nesting below is reconstructed (inputs and
    # status on the left, audio on the right) - confirm against the live layout.
    with gr.Blocks(
        title="KittenTTS - High Quality Text-to-Speech",
        theme=gr.themes.Soft(font=["Arial", "sans-serif"]),
    ) as demo:
        gr.Markdown(_INTRO_MARKDOWN)

        with gr.Row():
            with gr.Column(scale=2):
                # Voice selection
                voice_dropdown = gr.Dropdown(
                    choices=voice_names,
                    value=voice_names[0],
                    label="🎤 Select Voice",
                    info="Choose between different male and female voices",
                )
                # Text input
                text_input = gr.Textbox(
                    label="Text to Speech",
                    placeholder=f"Enter text (max {MAX_CHARS} characters for best results)...",
                    lines=3,
                    max_length=MAX_CHARS,
                    show_copy_button=True,
                    info="Keep text short and simple for the nano model",
                )
                # Generate button
                generate_btn = gr.Button(
                    "🎵 Generate Speech",
                    variant="primary",
                    size="lg",
                )
                # Status message
                status_msg = gr.Textbox(
                    label="Status",
                    interactive=False,
                    show_label=True,
                )
            with gr.Column(scale=1):
                # Audio output
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    interactive=False,
                )

        # Example texts
        gr.Markdown("### 📝 Example Texts to Try (Short & Simple):")
        gr.Examples(
            examples=_EXAMPLES,
            inputs=[text_input, voice_dropdown],
            label="Click on any example to try it out",
        )

        # Event handlers: the button click and the Enter key in the text box
        # both run the same generation call.
        for register in (generate_btn.click, text_input.submit):
            register(
                fn=generate_speech,
                inputs=[text_input, voice_dropdown],
                outputs=[audio_output, status_msg],
                show_progress=True,
            )

        # Footer
        gr.Markdown(_FOOTER_MARKDOWN)

    return demo
# Script entry point: build the UI and serve it.
if __name__ == "__main__":
    demo = create_interface()

    launch_options = {
        "server_name": "0.0.0.0",  # Allow external connections
        "server_port": 7860,       # Standard port for HF Spaces
        "share": False,            # Don't create a public link (HF Spaces handles this)
        "show_error": True,        # Show errors in the interface
        "quiet": False,            # Show startup logs
    }
    demo.launch(**launch_options)