File size: 4,971 Bytes
fd8eada
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aed6b70
fd8eada
 
aed6b70
fd8eada
 
 
 
3ab6181
fd8eada
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aed6b70
3ab6181
fd8eada
 
 
 
 
 
 
 
 
 
 
3ab6181
 
fd8eada
 
 
 
 
 
aed6b70
3ab6181
aed6b70
fd8eada
 
a71a89a
 
aed6b70
532394f
fd8eada
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aed6b70
fd8eada
 
 
 
 
 
aed6b70
fd8eada
 
 
 
4da9e15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import gradio as gr
import tempfile
import uuid
import os
from kittentts import KittenTTS
import soundfile as sf

# Initialize the TTS model once at import time; this single module-level
# instance is what generate_speech() and get_available_voices() both use.
# NOTE(review): presumably this fetches/loads the model weights on first run,
# which would make importing this module slow — confirm against KittenTTS.
model = KittenTTS("KittenML/kitten-tts-nano-0.1")

def generate_speech(text, voice, speed):
    """
    Generate speech from text using KittenTTS and save it as a WAV file.

    Args:
        text (str): Text to convert to speech
        voice (str): Voice to use for generation
        speed (float): Speed of speech generation

    Returns:
        str: Path to the generated audio file (a uniquely named ``.wav`` in
        the system temporary directory)

    Raises:
        gr.Error: If ``text`` is empty or whitespace-only, or if synthesis or
            writing the audio file fails. Gradio displays the message to the
            user instead of silently showing no audio.
    """
    # Every event handler in this file binds this function to exactly one
    # output component, so problems are reported by raising gr.Error rather
    # than by returning an extra "message" value alongside the path.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to generate speech.")

    try:
        # Generate audio
        audio = model.generate(text, voice=voice, speed=speed)

        # A UUID in the file name keeps concurrent requests from overwriting
        # each other's output.
        output_path = os.path.join(
            tempfile.gettempdir(), f"kitten_tts_{uuid.uuid4()}.wav"
        )

        # Save audio file (24000 is the sample rate handed to soundfile)
        sf.write(output_path, audio, 24000)
    except Exception as e:
        # Surface the failure in the UI and keep the original traceback
        # chained for the server log.
        raise gr.Error(f"Speech generation failed: {e}") from e

    return output_path

def get_available_voices():
    """Get list of available voices from the model.

    Returns:
        The model's ``available_voices`` value when it can be read and is
        non-empty; otherwise a one-element list holding the default voice
        ``"expr-voice-5-m"``.
    """
    fallback = ["expr-voice-5-m"]  # Default voice as fallback
    try:
        voices = model.available_voices
        return voices if voices else fallback
    except Exception:
        # Deliberately best-effort: any failure to query the model falls back
        # to the default voice. ``Exception`` (not a bare ``except``) so that
        # KeyboardInterrupt / SystemExit still propagate.
        return fallback

# Get available voices
available_voices = get_available_voices()

# Voice used when the model reports no voices at all
DEFAULT_VOICE = "expr-voice-5-m"


def _example_voice(index):
    """Return the voice at ``index``, wrapping around if fewer voices exist.

    The example rows ask for fixed positions (up to index 5); indexing the
    list directly raises IndexError whenever the model exposes fewer voices,
    e.g. when only the single fallback voice is available.
    """
    if not available_voices:
        return DEFAULT_VOICE
    return available_voices[index % len(available_voices)]


# Create Gradio interface
with gr.Blocks(title="KittenTTS - Text to Speech", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🐱 KittenTTS - Text to Speech Generator")
    gr.Markdown("Convert your text to high-quality speech using KittenTTS nano model!")
    
    with gr.Row():
        with gr.Column(scale=2):
            # Input components
            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech...",
                lines=4,
                max_lines=10
            )
            
            with gr.Row():
                voice_dropdown = gr.Dropdown(
                    choices=available_voices,
                    value=_example_voice(0),
                    label="Voice Selection",
                    info="Choose the voice for speech generation"
                )
                
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    step=0.01,
                    value=1.25,
                    label="Speech Speed",
                    info="Adjust the speed of speech (0.5x to 2.0x)"
                )
            
            generate_btn = gr.Button("🎡 Generate Speech", variant="primary", size="lg")
            
        with gr.Column(scale=1):
            # Output components
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
                autoplay=True
            )
    
    # Example inputs (each row: text, voice, speed — same order as `inputs`)
    gr.Markdown("## 📝 Example Texts")
    examples = gr.Examples(
        examples=[
            ["Hello! This is a test of the KittenTTS model.", _example_voice(2), 1.25],
            ["The quick brown fox jumps over the lazy dog.", _example_voice(1), 1.5],
            ["Welcome to the world of high-quality text-to-speech synthesis!", _example_voice(5), 1],
        ],
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output],
        fn=generate_speech,
        label="Click on an example to try it out",
        cache_examples="lazy"
    )
    
    # Model information
    with gr.Accordion("ℹ️ Model Information", open=False):
        gr.Markdown("""
        **Model:** KittenML/kitten-tts-nano-0.1
        
        **Features:**
        - High-quality text-to-speech synthesis
        - Works without GPU acceleration
        - Multiple voice options
        - Adjustable speech speed
        - 24kHz audio output
        
        **Usage:**
        1. Enter your text in the text box
        2. Select a voice from the dropdown
        3. Adjust the speech speed if needed
        4. Click "Generate Speech" to create audio
        
        Generated files are saved in temporary directory with unique UUID filenames.
        """)
    
    # Event handlers: the button and the textbox submit event run the same
    # function with the same inputs and the single audio output.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output]
    )
    
    # Auto-generate on Enter key (optional)
    text_input.submit(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output]
    )

# Start the server only when this file is executed as a script
if __name__ == "__main__":
    # Turn on request queuing first (default concurrency limit of 100 per
    # event), then launch the queued app.
    queued_app = app.queue(default_concurrency_limit=100)
    queued_app.launch()