import gradio as gr
import numpy as np
import tempfile
import os
from kittentts import KittenTTS
import soundfile as sf

# Load the TTS model once at import time. Any failure is reported on stdout
# and then re-raised, so the app never starts without a usable model.
print("Loading KittenTTS model from Hugging Face...")
try:
    tts_model = KittenTTS("KittenML/kitten-tts-nano-0.1")
    print("✅ KittenTTS model loaded successfully!")
except Exception as load_error:
    # One print call, two output lines: the error itself, then an install hint.
    print(
        f"❌ Error loading model: {load_error}",
        "Make sure the kittentts package is properly installed",
        sep="\n",
    )
    raise

# Voice identifiers understood by the model: voice numbers 2 through 5, each
# listed as a male ("m") variant followed by a female ("f") variant.
AVAILABLE_VOICES = [
    f"expr-voice-{number}-{suffix}"
    for number in range(2, 6)
    for suffix in ("m", "f")
]

# Friendly dropdown label -> model voice identifier, in the same order as
# AVAILABLE_VOICES (insertion order drives the order shown in the UI).
VOICE_MAPPING = {
    f"Voice {number} - {label}": f"expr-voice-{number}-{suffix}"
    for number in range(2, 6)
    for suffix, label in (("m", "Male"), ("f", "Female"))
}

print(f"✅ Available voices: {AVAILABLE_VOICES}")

# Upper bound on input length, in characters. The model's exact limit is not
# known at this point; this value was found to work experimentally.
MAX_CHARS = 420

def generate_speech(text, voice_choice):
    """
    Generate speech from text using KittenTTS with voice selection

    Args:
        text (str): The text to convert to speech. ``None`` is treated the
            same as empty input.
        voice_choice (str): The selected voice option (a key of
            ``VOICE_MAPPING``). Any other value falls back to the model's
            default voice.

    Returns:
        tuple: ``(audio, status_message)``. On success ``audio`` is
        ``(sample_rate, audio_array)`` for the Gradio audio component; on
        invalid input or failure it is ``None``. ``status_message`` is
        always a human-readable string for the status box.
    """
    # A missing value must not crash on .strip(); treat it like blank input.
    if text is None or not text.strip():
        return None, "Please enter some text to generate speech."

    # Check text length - KittenTTS nano model has context limitations.
    # Measured on the user's text, before any padding is appended below.
    char_count = len(text)
    if char_count > MAX_CHARS:
        return None, f"Text too long! Please limit to {MAX_CHARS} characters. Current length: {char_count} characters."

    # Padding added because the model cuts off the audio sometimes. Kept in a
    # separate variable so the reported character count stays accurate.
    padded_text = text + " ..."

    try:
        # Resolve the friendly label to a model voice identifier (None if unknown)
        voice_id = VOICE_MAPPING.get(voice_choice)

        if voice_id is not None:
            print(f"Using voice: {voice_choice} ({voice_id})")
            audio = tts_model.generate(padded_text, voice=voice_id)
        else:
            # Fall back to default voice
            audio = tts_model.generate(padded_text)

        # KittenTTS returns audio at 24kHz sample rate
        sample_rate = 24000

        # Ensure audio is in the right format for Gradio
        if isinstance(audio, np.ndarray):
            if audio.size == 0:
                # Nothing to play back; report it instead of handing an
                # empty array to the audio component.
                return None, "No audio was generated. Please try different text."

            audio = audio.astype(np.float32)
            # Scale down only when samples fall outside [-1.0, 1.0]
            peak = np.max(np.abs(audio))
            if peak > 1.0:
                audio = audio / peak

        voice_msg = f" with {voice_choice}" if voice_id is not None else ""
        return (sample_rate, audio), f"Speech generated successfully{voice_msg}! ({char_count} characters)"

    except Exception as e:
        # UI boundary: convert any failure into a status message rather than
        # letting it propagate to the caller.
        error_msg = str(e)
        print(f"Error details: {e}")

        # Provide helpful error messages for common issues
        if "INVALID_ARGUMENT" in error_msg and "Expand" in error_msg:
            return None, "Text is too long or complex for the model. Please try shorter, simpler text."
        elif "ONNXRuntimeError" in error_msg:
            return None, "Model processing error. Try shorter text or simpler punctuation."
        else:
            return None, f"Error generating speech: {error_msg}"

def create_interface():
    """Build the Gradio Blocks UI for the KittenTTS demo and return it.

    The page consists of an intro, a row with a controls column (voice
    dropdown, text box, generate button, status box) next to an audio output
    column, a set of clickable examples, and a footer. Both clicking the
    button and submitting the text box call ``generate_speech``.
    """
    voice_labels = list(VOICE_MAPPING)
    soft_theme = gr.themes.Soft(font=["Arial", "sans-serif"])

    with gr.Blocks(title="KittenTTS - High Quality Text-to-Speech", theme=soft_theme) as demo:

        # Intro / header
        gr.Markdown("""
        # 🐱 KittenTTS - High Quality Text-to-Speech
        
        Generate high-quality speech from text using [KittenTTS](https://huggingface.co/KittenML/kitten-tts-nano-0.1), 
        a lightweight TTS model that works without GPU!
        
        Choose from multiple voice options and enter your text to hear the synthesized speech.
        """)

        with gr.Row():
            # Controls column (given twice the width of the audio column)
            with gr.Column(scale=2):
                voice_dropdown = gr.Dropdown(
                    choices=voice_labels,
                    value=voice_labels[0],  # first voice is preselected
                    label="🎤 Select Voice",
                    info="Choose between different male and female voices",
                )

                text_input = gr.Textbox(
                    label="Text to Speech",
                    placeholder=f"Enter text (max {MAX_CHARS} characters for best results)...",
                    lines=3,
                    max_length=MAX_CHARS,
                    show_copy_button=True,
                    info="Keep text short and simple for the nano model",
                )

                generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

                # Read-only box that shows the status string returned by the handler
                status_msg = gr.Textbox(label="Status", interactive=False, show_label=True)

            # Audio column
            with gr.Column(scale=1):
                audio_output = gr.Audio(label="Generated Speech", type="numpy", interactive=False)

        # Clickable examples: each row is (text, voice label)
        gr.Markdown("### 📝 Example Texts to Try (Short & Simple):")
        sample_rows = [
            ["Hello world! This is KittenTTS.", "Voice 2 - Female"],
            ["The quick brown fox jumps over the lazy dog.", "Voice 3 - Male"],
            ["This model works without a GPU.", "Voice 4 - Female"],
            ["Welcome to KittenTTS!", "Voice 5 - Male"],
            ["How are you today?", "Voice 2 - Male"],
            ["The weather is nice today.", "Voice 3 - Female"],
        ]
        gr.Examples(
            examples=sample_rows,
            inputs=[text_input, voice_dropdown],
            label="Click on any example to try it out",
        )

        # Wire the same handler to the button click and to text box submission
        # (registered in that order).
        for register in (generate_btn.click, text_input.submit):
            register(
                fn=generate_speech,
                inputs=[text_input, voice_dropdown],
                outputs=[audio_output, status_msg],
                show_progress=True,
            )

        # Footer
        gr.Markdown("""
        ---
        
        **About KittenTTS Nano:**
        - Lightweight 15M parameter text-to-speech model
        - Works without GPU - optimized for efficiency  
        - Multiple voice options (male and female variants)
        - 24kHz output sample rate
        - **Best with short texts (under 400 characters)**
        - Model: [KittenML/kitten-tts-nano-0.1](https://huggingface.co/KittenML/kitten-tts-nano-0.1)
        - Built by [KittenML](https://github.com/KittenML/KittenTTS)
        
        **Usage Tips for Nano Model:**
        - ✅ Keep text short and simple (about 400 characters)
        - ✅ Use common words and standard punctuation
        - ✅ Break long content into shorter sentences
        - ❌ Avoid very long sentences or complex punctuation
        - ❌ Avoid technical jargon or unusual words
        """)

    return demo

# Script entry point: build the interface and serve it.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",  # Allow external connections
        "server_port": 7860,       # Standard port for HF Spaces
        "share": False,            # Don't create a public link (HF Spaces handles this)
        "show_error": True,        # Show errors in the interface
        "quiet": False,            # Show startup logs
    }

    demo = create_interface()
    demo.launch(**launch_options)