"""Gradio demo app for the KittenTTS nano text-to-speech model."""

import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
from kittentts import KittenTTS

# Initialize the TTS model once at import time so every request reuses it.
print("Loading KittenTTS model from Hugging Face...")
try:
    tts_model = KittenTTS("KittenML/kitten-tts-nano-0.1")
    print("✅ KittenTTS model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    print("Make sure the kittentts package is properly installed")
    raise

# Available voices from the model
AVAILABLE_VOICES = [
    'expr-voice-2-m', 'expr-voice-2-f',
    'expr-voice-3-m', 'expr-voice-3-f',
    'expr-voice-4-m', 'expr-voice-4-f',
    'expr-voice-5-m', 'expr-voice-5-f',
]

# Friendly dropdown labels mapped to the model's voice identifiers
VOICE_MAPPING = {
    "Voice 2 - Male": "expr-voice-2-m",
    "Voice 2 - Female": "expr-voice-2-f",
    "Voice 3 - Male": "expr-voice-3-m",
    "Voice 3 - Female": "expr-voice-3-f",
    "Voice 4 - Male": "expr-voice-4-m",
    "Voice 4 - Female": "expr-voice-4-f",
    "Voice 5 - Male": "expr-voice-5-m",
    "Voice 5 - Female": "expr-voice-5-f",
}

print(f"✅ Available voices: {AVAILABLE_VOICES}")

MAX_CHARS = 420  # we don't know the exact limit at this point - works experimentally

# Sample rate (Hz) reported to Gradio for the audio KittenTTS returns.
_SAMPLE_RATE = 24000

# Appended to the text sent to the model because it sometimes cuts off the
# end of the audio. It is a workaround only and is never counted or shown.
_TRAILING_PAD = " ..."

# Markdown bodies are kept flush-left at module level so their rendering does
# not depend on the component stripping source-code indentation.
_HEADER_MARKDOWN = """
# 🐱 KittenTTS - High Quality Text-to-Speech

Generate high-quality speech from text using [KittenTTS](https://huggingface.co/KittenML/kitten-tts-nano-0.1), a lightweight TTS model that works without GPU!
Choose from multiple voice options and enter your text to hear the synthesized speech.
"""

_FOOTER_MARKDOWN = """
---
**About KittenTTS Nano:**
- Lightweight 15M parameter text-to-speech model
- Works without GPU - optimized for efficiency
- Multiple voice options (male and female variants)
- 24kHz output sample rate
- **Best with short texts (under 400 characters)**
- Model: [KittenML/kitten-tts-nano-0.1](https://huggingface.co/KittenML/kitten-tts-nano-0.1)
- Built by [KittenML](https://github.com/KittenML/KittenTTS)

**Usage Tips for Nano Model:**
- ✅ Keep text short and simple (about 400 characters)
- ✅ Use common words and standard punctuation
- ✅ Break long content into shorter sentences
- ❌ Avoid very long sentences or complex punctuation
- ❌ Avoid technical jargon or unusual words
"""


def _prepare_audio(audio):
    """Return *audio* as float32, scaled into [-1.0, 1.0] if it overshoots.

    Anything that is not a NumPy array is returned untouched.
    """
    if not isinstance(audio, np.ndarray):
        return audio

    audio = audio.astype(np.float32)
    # `size` (not `len`) so an empty array of any shape skips the reduction.
    if audio.size > 0:
        peak = np.max(np.abs(audio))
        if peak > 1.0:
            audio = audio / peak
    return audio


def _describe_error(error_msg):
    """Translate a raw exception message into a user-facing status string."""
    if "INVALID_ARGUMENT" in error_msg and "Expand" in error_msg:
        return "Text is too long or complex for the model. Please try shorter, simpler text."
    if "ONNXRuntimeError" in error_msg:
        return "Model processing error. Try shorter text or simpler punctuation."
    return f"Error generating speech: {error_msg}"


def generate_speech(text: str, voice_choice: str):
    """
    Generate speech from text using KittenTTS with voice selection

    Args:
        text (str): The text to convert to speech
        voice_choice (str): The selected voice label (a key of VOICE_MAPPING);
            any other value falls back to the model's default voice

    Returns:
        tuple: (audio, status_message). ``audio`` is ``(sample_rate, audio_array)``
        for the Gradio audio component on success, or ``None`` when the input
        is rejected or generation fails. ``status_message`` is always a string.
    """
    # Treat a missing value like blank text instead of crashing on .strip().
    if text is None or not text.strip():
        return None, "Please enter some text to generate speech."

    # Measure the user's own text; the padding added below must not inflate
    # either the limit check or the count reported back to the user.
    char_count = len(text)

    # Check text length - KittenTTS nano model has context limitations
    if char_count > MAX_CHARS:
        return None, (
            f"Text too long! Please limit to {MAX_CHARS} characters. "
            f"Current length: {char_count} characters."
        )

    padded_text = text + _TRAILING_PAD

    try:
        voice_id = VOICE_MAPPING.get(voice_choice)
        if voice_id is not None:
            print(f"Using voice: {voice_choice} ({voice_id})")
            audio = tts_model.generate(padded_text, voice=voice_id)
        else:
            # Unknown label: fall back to the model's default voice
            audio = tts_model.generate(padded_text)

        audio = _prepare_audio(audio)
    except Exception as e:
        # UI boundary: report the failure in the status box rather than raise.
        print(f"Error details: {e}")
        return None, _describe_error(str(e))

    voice_msg = f" with {voice_choice}" if voice_id is not None else ""
    return (
        (_SAMPLE_RATE, audio),
        f"Speech generated successfully{voice_msg}! ({char_count} characters)",
    )


def create_interface():
    """Create the Gradio interface

    Returns:
        The configured ``gr.Blocks`` app, not yet launched.
    """
    voice_labels = list(VOICE_MAPPING)

    with gr.Blocks(
        title="KittenTTS - High Quality Text-to-Speech",
        theme=gr.themes.Soft(font=["Arial", "sans-serif"]),
    ) as demo:
        gr.Markdown(_HEADER_MARKDOWN)

        with gr.Row():
            with gr.Column(scale=2):
                # Voice selection
                voice_dropdown = gr.Dropdown(
                    choices=voice_labels,
                    value=voice_labels[0],
                    label="🎤 Select Voice",
                    info="Choose between different male and female voices",
                )

                # Text input
                text_input = gr.Textbox(
                    label="Text to Speech",
                    placeholder=f"Enter text (max {MAX_CHARS} characters for best results)...",
                    lines=3,
                    max_length=MAX_CHARS,
                    show_copy_button=True,
                    info="Keep text short and simple for the nano model",
                )

                # Generate button
                generate_btn = gr.Button(
                    "🎵 Generate Speech",
                    variant="primary",
                    size="lg",
                )

                # Status message
                status_msg = gr.Textbox(
                    label="Status",
                    interactive=False,
                    show_label=True,
                )

            with gr.Column(scale=1):
                # Audio output
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    interactive=False,
                )

        # Example texts
        gr.Markdown("### 📝 Example Texts to Try (Short & Simple):")

        examples = [
            ["Hello world! This is KittenTTS.", "Voice 2 - Female"],
            ["The quick brown fox jumps over the lazy dog.", "Voice 3 - Male"],
            ["This model works without a GPU.", "Voice 4 - Female"],
            ["Welcome to KittenTTS!", "Voice 5 - Male"],
            ["How are you today?", "Voice 2 - Male"],
            ["The weather is nice today.", "Voice 3 - Female"],
        ]

        gr.Examples(
            examples=examples,
            inputs=[text_input, voice_dropdown],
            label="Click on any example to try it out",
        )

        # Event handlers: the button click and the Enter key in the textbox
        # both run the same generation call with identical wiring.
        for register in (generate_btn.click, text_input.submit):
            register(
                fn=generate_speech,
                inputs=[text_input, voice_dropdown],
                outputs=[audio_output, status_msg],
                show_progress=True,
            )

        # Footer
        gr.Markdown(_FOOTER_MARKDOWN)

    return demo


# Create and launch the interface
if __name__ == "__main__":
    demo = create_interface()

    # Launch the app
    demo.launch(
        server_name="0.0.0.0",  # Allow external connections
        server_port=7860,       # Standard port for HF Spaces
        share=False,            # Don't create a public link (HF Spaces handles this)
        show_error=True,        # Show errors in the interface
        quiet=False,            # Show startup logs
    )