Spaces:

KingNish
/

Kitten-TTS

Running

File size: 4,971 Bytes

fd8eada
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aed6b70
fd8eada
 
aed6b70
fd8eada
 
 
 
3ab6181
fd8eada
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aed6b70
3ab6181
fd8eada
 
 
 
 
 
 
 
 
 
 
3ab6181
 
fd8eada
 
 
 
 
 
aed6b70
3ab6181
aed6b70
fd8eada
 
a71a89a
 
aed6b70
532394f
fd8eada
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aed6b70
fd8eada
 
 
 
 
 
aed6b70
fd8eada
 
 
 
4da9e15

import logging
import os
import tempfile
import uuid

import gradio as gr
import soundfile as sf
from kittentts import KittenTTS

# Initialize the TTS model once at import time; generate_speech() and
# get_available_voices() both use this single module-level instance.
model = KittenTTS("KittenML/kitten-tts-nano-0.1")

def generate_speech(text, voice, speed):
    """
    Generate speech from text using KittenTTS and save it as a WAV file.

    Args:
        text (str): Text to convert to speech
        voice (str): Voice to use for generation
        speed (float): Speed of speech generation

    Returns:
        str | None: Path to the generated audio file, or None when the text
        is missing/blank or when generating or saving the audio fails.
    """
    # Every call site in this file binds the result to a single audio output,
    # so only a path or None is ever returned -- never a (value, message)
    # tuple. `not text` also covers a None payload.
    if not text or not text.strip():
        return None

    try:
        # Generate audio
        audio = model.generate(text, voice=voice, speed=speed)

        # UUID-based name in the system temp dir so each call gets its own file
        output_path = os.path.join(
            tempfile.gettempdir(), f"kitten_tts_{uuid.uuid4()}.wav"
        )

        # Save audio file (24000 is passed as the sample rate)
        sf.write(output_path, audio, 24000)
    except Exception:
        # Deliberately best-effort: the caller still gets None, but the
        # failure is logged with its traceback instead of vanishing silently.
        logging.getLogger(__name__).exception(
            "Speech generation failed (voice=%r, speed=%r)", voice, speed
        )
        return None

    return output_path

def get_available_voices():
    """Get list of available voices from the model.

    Returns:
        list: The entries of ``model.available_voices``, or the single
        default voice ``["expr-voice-5-m"]`` when that attribute is empty
        or cannot be read.
    """
    fallback = ["expr-voice-5-m"]  # Default voice as fallback
    try:
        voices = list(model.available_voices or ())
    except Exception:
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit are not swallowed here.
        return fallback
    return voices or fallback

# Get available voices once at import time; the UI below uses this list for
# the voice dropdown choices and for the voices shown in the example rows.
available_voices = get_available_voices()

def _pick_voice(index):
    """Return the voice at *index*, wrapping around when fewer voices exist.

    The voice list may be as short as a single fallback entry, so fixed
    indices such as 2 or 5 must not be used directly.
    """
    if not available_voices:
        return "expr-voice-5-m"
    return available_voices[index % len(available_voices)]


# Create Gradio interface
with gr.Blocks(title="KittenTTS - Text to Speech", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🐱 KittenTTS - Text to Speech Generator")
    gr.Markdown("Convert your text to high-quality speech using KittenTTS nano model!")

    with gr.Row():
        with gr.Column(scale=2):
            # Input components
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech...",
                lines=4,
                max_lines=10
            )

            with gr.Row():
                voice_dropdown = gr.Dropdown(
                    choices=available_voices,
                    value=_pick_voice(0),
                    label="Voice Selection",
                    info="Choose the voice for speech generation"
                )

                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    step=0.01,
                    value=1.25,
                    label="Speech Speed",
                    info="Adjust the speed of speech (0.5x to 2.0x)"
                )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            # Output components
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
                autoplay=True
            )

    # Example inputs. Voices are looked up through _pick_voice() so building
    # the examples cannot raise IndexError when only a few voices (or just
    # the single fallback voice) are available.
    gr.Markdown("## 📝 Example Texts")
    examples = gr.Examples(
        examples=[
            ["Hello! This is a test of the KittenTTS model.", _pick_voice(2), 1.25],
            ["The quick brown fox jumps over the lazy dog.", _pick_voice(1), 1.5],
            ["Welcome to the world of high-quality text-to-speech synthesis!", _pick_voice(5), 1],
        ],
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output],
        fn=generate_speech,
        label="Click on an example to try it out",
        cache_examples="lazy"
    )

    # Model information
    with gr.Accordion("ℹ️ Model Information", open=False):
        gr.Markdown("""
        **Model:** KittenML/kitten-tts-nano-0.1
        
        **Features:**
        - High-quality text-to-speech synthesis
        - Works without GPU acceleration
        - Multiple voice options
        - Adjustable speech speed
        - 24kHz audio output
        
        **Usage:**
        1. Enter your text in the text box
        2. Select a voice from the dropdown
        3. Adjust the speech speed if needed
        4. Click "Generate Speech" to create audio
        
        Generated files are saved in temporary directory with unique UUID filenames.
        """)

    # Event handlers: the button click and pressing Enter in the text box
    # both run the same generation with the same inputs and output.
    for register_listener in (generate_btn.click, text_input.submit):
        register_listener(
            fn=generate_speech,
            inputs=[text_input, voice_dropdown, speed_slider],
            outputs=[audio_output]
        )

# Start the app only when this file is executed as a script
if __name__ == "__main__":
    # Enable the request queue (default_concurrency_limit=100), then serve.
    app.queue(default_concurrency_limit=100)
    app.launch()