File size: 8,373 Bytes
df30b4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c70dcc1
 
 
 
 
509ede7
 
 
 
 
 
 
 
 
 
 
 
 
c70dcc1
 
22a2bf5
 
509ede7
df30b4e
509ede7
df30b4e
 
 
509ede7
df30b4e
 
 
 
 
 
 
a3ece9e
 
 
 
 
 
df30b4e
c70dcc1
 
 
 
 
509ede7
df30b4e
c70dcc1
 
 
509ede7
 
 
df30b4e
 
 
 
 
 
 
 
c70dcc1
df30b4e
 
c70dcc1
a3ece9e
 
df30b4e
 
a3ece9e
c70dcc1
a3ece9e
 
 
 
 
 
 
 
df30b4e
 
 
 
 
 
748a421
df30b4e
 
 
 
 
 
 
 
509ede7
df30b4e
 
 
 
509ede7
 
c70dcc1
 
509ede7
 
 
 
df30b4e
 
 
22a2bf5
df30b4e
22a2bf5
a3ece9e
 
df30b4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a3ece9e
df30b4e
a3ece9e
 
 
 
 
 
df30b4e
 
 
 
509ede7
df30b4e
 
 
 
 
 
509ede7
df30b4e
 
 
 
 
 
 
509ede7
df30b4e
 
 
 
 
 
 
 
a3ece9e
 
 
509ede7
df30b4e
a3ece9e
df30b4e
 
 
a3ece9e
 
 
 
 
 
df30b4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import gradio as gr
import numpy as np
import tempfile
import os
from kittentts import KittenTTS
import soundfile as sf

# Initialize the TTS model once, at module import, so that generate_speech()
# can reuse the same instance for every request.
print("Loading KittenTTS model from Hugging Face...")
try:
    # Only the constructor call is guarded: it is the step that can fail.
    tts_model = KittenTTS("KittenML/kitten-tts-nano-0.1")
except Exception as e:
    # Log a hint for the operator, then re-raise: the app cannot run
    # without a model, so start-up should fail loudly.
    print(f"❌ Error loading model: {e}")
    print("Make sure the kittentts package is properly installed")
    raise
else:
    print("✅ KittenTTS model loaded successfully!")

# Friendly voice names shown in the UI, mapped to the voice identifiers that
# are passed to the model. Insertion order is the order shown in the dropdown.
VOICE_MAPPING = {
    "Voice 2 - Male": "expr-voice-2-m",
    "Voice 2 - Female": "expr-voice-2-f",
    "Voice 3 - Male": "expr-voice-3-m",
    "Voice 3 - Female": "expr-voice-3-f",
    "Voice 4 - Male": "expr-voice-4-m",
    "Voice 4 - Female": "expr-voice-4-f",
    "Voice 5 - Male": "expr-voice-5-m",
    "Voice 5 - Female": "expr-voice-5-f",
}

# Available voice identifiers. Derived from VOICE_MAPPING so the two can never
# drift apart; the resulting list is identical to the previous hand-written one.
AVAILABLE_VOICES = list(VOICE_MAPPING.values())

print(f"✅ Available voices: {AVAILABLE_VOICES}")

# Maximum accepted input length in characters.
# We don't know the model's exact limit at this point - this value works
# experimentally.
MAX_CHARS = 420

def generate_speech(text, voice_choice):
    """
    Generate speech from text using KittenTTS with voice selection.

    Args:
        text (str): The text to convert to speech. ``None`` is handled the
            same way as an empty string.
        voice_choice (str): The selected voice option, looked up in
            ``VOICE_MAPPING``. A value that is not in the mapping falls back
            to the model's default voice.

    Returns:
        tuple: ``(audio, status)``. On success ``audio`` is
        ``(sample_rate, audio_array)`` for the Gradio audio component; on any
        failure it is ``None``. ``status`` is always a human-readable message
        for the status textbox.
    """
    # Guard against a missing value as well as blank input, so the caller
    # gets the friendly message instead of an AttributeError on None.
    if text is None or not text.strip():
        return None, "Please enter some text to generate speech."

    # Check text length - KittenTTS nano model has context limitations
    if len(text) > MAX_CHARS:
        return None, f"Text too long! Please limit to {MAX_CHARS} characters. Current length: {len(text)} characters."

    # Count what the user actually entered, before the padding below is added,
    # so the status message never reports more characters than were typed.
    char_count = len(text)

    # Added because the model cuts off the audio sometimes.
    padded_text = text + " ..."

    try:
        # Resolve the friendly label to the model's voice identifier
        # (None when the label is unknown).
        voice_id = VOICE_MAPPING.get(voice_choice)

        # Generate audio using KittenTTS
        if voice_id is not None:
            print(f"Using voice: {voice_choice} ({voice_id})")
            audio = tts_model.generate(padded_text, voice=voice_id)
        else:
            # Fall back to default voice
            audio = tts_model.generate(padded_text)

        # KittenTTS returns audio at 24kHz sample rate
        sample_rate = 24000

        # Ensure audio is in the right format for Gradio: float32, and scaled
        # down only when it exceeds the [-1.0, 1.0] range.
        if isinstance(audio, np.ndarray):
            audio = audio.astype(np.float32)
            if audio.size > 0:
                peak = np.max(np.abs(audio))
                if peak > 1.0:
                    audio = audio / peak

        voice_msg = f" with {voice_choice}" if voice_id is not None else ""
        return (sample_rate, audio), f"Speech generated successfully{voice_msg}! ({char_count} characters)"

    except Exception as e:
        # UI boundary: report the failure in the status box rather than
        # letting the exception propagate to Gradio.
        error_msg = str(e)
        print(f"Error details: {e}")

        # Provide helpful error messages for common issues
        if "INVALID_ARGUMENT" in error_msg and "Expand" in error_msg:
            return None, "Text is too long or complex for the model. Please try shorter, simpler text."
        if "ONNXRuntimeError" in error_msg:
            return None, "Model processing error. Try shorter text or simpler punctuation."
        return None, f"Error generating speech: {error_msg}"

def create_interface():
    """
    Create the Gradio interface.

    Builds a Blocks layout with a voice dropdown, a text box, a generate
    button, a read-only status box and an audio player, and wires both the
    button click and the text box submit (Enter key) to ``generate_speech``.

    Returns:
        gr.Blocks: The assembled (not yet launched) demo.
    """
    # Friendly voice labels, computed once; the first one is the default.
    voice_labels = list(VOICE_MAPPING)

    with gr.Blocks(
        title="KittenTTS - High Quality Text-to-Speech",
        theme=gr.themes.Soft(font=["Arial", "sans-serif"]),
    ) as demo:

        gr.Markdown("""
        # 🐱 KittenTTS - High Quality Text-to-Speech
        
        Generate high-quality speech from text using [KittenTTS](https://huggingface.co/KittenML/kitten-tts-nano-0.1), 
        a lightweight TTS model that works without GPU!
        
        Choose from multiple voice options and enter your text to hear the synthesized speech.
        """)

        with gr.Row():
            with gr.Column(scale=2):
                # Voice selection
                voice_dropdown = gr.Dropdown(
                    choices=voice_labels,
                    value=voice_labels[0],
                    label="🎤 Select Voice",
                    info="Choose between different male and female voices"
                )

                # Text input; max_length mirrors the server-side MAX_CHARS check
                text_input = gr.Textbox(
                    label="Text to Speech",
                    placeholder=f"Enter text (max {MAX_CHARS} characters for best results)...",
                    lines=3,
                    max_length=MAX_CHARS,
                    show_copy_button=True,
                    info="Keep text short and simple for the nano model"
                )

                # Generate button
                generate_btn = gr.Button(
                    "🎵 Generate Speech",
                    variant="primary",
                    size="lg"
                )

                # Status message
                status_msg = gr.Textbox(
                    label="Status",
                    interactive=False,
                    show_label=True
                )

            with gr.Column(scale=1):
                # Audio output
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    interactive=False
                )

        # Example texts; each row is [text, voice label]
        gr.Markdown("### 📝 Example Texts to Try (Short & Simple):")
        examples = [
            ["Hello world! This is KittenTTS.", "Voice 2 - Female"],
            ["The quick brown fox jumps over the lazy dog.", "Voice 3 - Male"],
            ["This model works without a GPU.", "Voice 4 - Female"],
            ["Welcome to KittenTTS!", "Voice 5 - Male"],
            ["How are you today?", "Voice 2 - Male"],
            ["The weather is nice today.", "Voice 3 - Female"]
        ]

        gr.Examples(
            examples=examples,
            inputs=[text_input, voice_dropdown],
            label="Click on any example to try it out"
        )

        # Event handlers: the button click and the Enter key in the text box
        # run the same handler with the same inputs and outputs.
        speech_event = dict(
            fn=generate_speech,
            inputs=[text_input, voice_dropdown],
            outputs=[audio_output, status_msg],
            show_progress=True,
        )
        generate_btn.click(**speech_event)
        text_input.submit(**speech_event)

        # Footer
        gr.Markdown("""
        ---
        
        **About KittenTTS Nano:**
        - Lightweight 15M parameter text-to-speech model
        - Works without GPU - optimized for efficiency  
        - Multiple voice options (male and female variants)
        - 24kHz output sample rate
        - **Best with short texts (under 400 characters)**
        - Model: [KittenML/kitten-tts-nano-0.1](https://huggingface.co/KittenML/kitten-tts-nano-0.1)
        - Built by [KittenML](https://github.com/KittenML/KittenTTS)
        
        **Usage Tips for Nano Model:**
        - ✅ Keep text short and simple (about 400 characters)
        - ✅ Use common words and standard punctuation
        - ✅ Break long content into shorter sentences
        - ❌ Avoid very long sentences or complex punctuation
        - ❌ Avoid technical jargon or unusual words
        """)

    return demo

# Script entry point: build the interface and serve it
if __name__ == "__main__":
    # Options handed to Blocks.launch()
    launch_options = {
        "server_name": "0.0.0.0",  # Allow external connections
        "server_port": 7860,       # Standard port for HF Spaces
        "share": False,            # Don't create a public link (HF Spaces handles this)
        "show_error": True,        # Show errors in the interface
        "quiet": False,            # Show startup logs
    }

    demo = create_interface()
    demo.launch(**launch_options)