import os
import subprocess
import sys

# Set OMP_NUM_THREADS to 4 here, ahead of the numpy/torch imports further down,
# so the variable is already in the environment when those packages are imported.
os.environ["OMP_NUM_THREADS"] = "4"

# Install dependencies programmatically to avoid conflicts
def setup_dependencies():
    """Install the development build of transformers, at most once.

    A marker file at ``/tmp/deps_installed`` records a successful install;
    when it already exists the function returns without doing anything.
    This is best-effort: any failure (pip error, unwritable marker, ...) is
    printed and swallowed so the caller keeps running.
    """
    marker_path = '/tmp/deps_installed'
    pip_command = [
        sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-cache-dir",
        "git+https://github.com/huggingface/transformers.git",
    ]

    try:
        if not os.path.exists(marker_path):
            print("Installing transformers dev version...")
            subprocess.check_call(pip_command)

            # Only reached when pip exited successfully.
            with open(marker_path, 'w') as marker:
                marker.write('done')
    except Exception as err:
        print(f"Dependencies setup error: {err}")

# Run at import time, ahead of the third-party imports below, so the install
# (when one is needed) happens before those packages are loaded.
setup_dependencies()

import spaces
import gradio as gr
from util import Config, NemoAudioPlayer, KaniModel
import numpy as np
import torch

# Hugging Face access token taken from the environment (None when HF_TOKEN is unset)
token_ = os.environ.get('HF_TOKEN')

# Selectable presets: name shown in the UI dropdown -> Config used to build the model.
# The base preset uses Config() as-is; the two voice presets override
# model_name and temperature.
models_configs = dict(
    Base_pretrained_model=Config(),
    Female_voice=Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex02-v.0.2',
        temperature=0.2,
    ),
    Male_voice=Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex01-v.0.1',
        temperature=0.2,
    ),
)

# Module-level caches filled in by initialize_models() on first use:
#   player - the shared NemoAudioPlayer instance (None until it is created)
#   models - maps each models_configs key to its loaded KaniModel
player = None
models = {}

def initialize_models():
    """Create the shared audio player and load every configured model once.

    The player is stored in the module-level ``player`` and each loaded model
    in the module-level ``models`` dict under its ``models_configs`` key.
    Work already done is skipped, so repeated calls are cheap.

    Only the presets missing from ``models`` are loaded. If an earlier call
    failed part-way through (some models loaded, one raised), the next call
    retries the remaining ones instead of leaving the registry incomplete.

    Raises:
        Whatever ``NemoAudioPlayer`` or ``KaniModel`` raise on construction.
    """
    global player, models

    if player is None:
        print("Initializing NeMo Audio Player...")
        player = NemoAudioPlayer(Config())
        print("NeMo Audio Player initialized!")

    # Checking per key (rather than "is the dict empty") is what makes a
    # partially failed load recoverable on the next call.
    pending = [name for name in models_configs if name not in models]
    if not pending:
        return

    print("Loading TTS models...")
    for model_name in pending:
        print(f"Loading {model_name}...")
        models[model_name] = KaniModel(models_configs[model_name], player, token_)
        print(f"{model_name} loaded!")
    print("All models loaded!")

@spaces.GPU
def generate_speech_gpu(text, model_choice):
    """
    Generate speech from text using the selected model on GPU.

    Args:
        text: Text to synthesize. ``None``, empty and whitespace-only values
            are rejected with a status message.
        model_choice: Key of the model to use in the module-level ``models``
            registry.

    Returns:
        A ``(audio, status)`` pair. ``audio`` is ``(sample_rate, samples)`` on
        success and ``None`` otherwise; ``status`` is a message for the UI.
        Failures during model loading or generation are caught and reported
        through ``status`` rather than raised.
    """
    # Validate before touching the models so bad input is rejected without
    # paying for a model load. `not text` also covers a None value.
    if not text or not text.strip():
        return None, "Please enter text for speech generation."

    if not model_choice:
        return None, "Please select a model."

    try:
        # Kept inside the try so a failed model load is reported through the
        # status message like any other generation error.
        initialize_models()

        # The device is only reported in logs and in the status message.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        # An unknown key raises KeyError, which the handler below reports.
        selected_model = models[model_choice]

        print(f"Generating speech with {model_choice}...")
        audio, _ = selected_model.run_model(text)

        # Gradio's numpy audio format is (sample_rate, audio_data).
        # NOTE(review): 22050 is hard-coded; confirm it matches the rate the
        # model actually produces.
        sample_rate = 22050
        print("Speech generation completed!")

        return (sample_rate, audio), f"✅ Audio generated successfully using {model_choice} on {device}"

    except Exception as e:
        print(f"Error during generation: {e}")
        return None, f"❌ Error during generation: {e}"

def validate_input(text, model_choice):
    """Return a status message describing whether the inputs are usable.

    Runs without the GPU decorator, so it is cheap enough to call on every
    textbox change.

    Args:
        text: Text entered by the user. ``None``, empty and whitespace-only
            values all count as missing.
        model_choice: Selected model name; any falsy value counts as missing.

    Returns:
        A warning string when an input is missing, otherwise a "ready" message
        naming the selected model.
    """
    # `not text` guards against None, which would otherwise crash on .strip().
    if not text or not text.strip():
        return "⚠️ Please enter text for speech generation."
    if not model_choice:
        return "⚠️ Please select a model."
    return f"✅ Ready to generate with {model_choice}"

# Create Gradio interface.
# One row with two columns: the first holds the inputs (model dropdown, text
# box, buttons), the second holds the outputs (audio player, status box).
# Below the row come the clickable text examples and a collapsed info section.
with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎤 KaniTTS - Text to Speech with Zero GPU")
    gr.Markdown("Select a model and enter text to generate high-quality speech")
    
    with gr.Row():
        with gr.Column(scale=1):
            # Choices are the keys of models_configs; the first key is preselected.
            model_dropdown = gr.Dropdown(
                choices=list(models_configs.keys()),
                value=list(models_configs.keys())[0],
                label="Select Model",
                info="Base - default model, Female - female voice, Male - male voice"
            )
            
            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="Enter text for speech generation...",
                lines=3,
                max_lines=10
            )
            
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
            
            # Quick validation button (handler is not GPU-decorated)
            validate_btn = gr.Button("🔍 Validate Input", variant="secondary")
            
        with gr.Column(scale=1):
            # type="numpy" matches the (sample_rate, audio) tuple that
            # generate_speech_gpu returns.
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy"
            )
            
            status_text = gr.Textbox(
                label="Status",
                interactive=False,
                value="Ready to generate speech"
            )
    
    # GPU generation event: generate_speech_gpu returns (audio, status), which
    # map onto the two outputs in order.
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown],
        outputs=[audio_output, status_text]
    )
    
    # Validation event: only the status box is updated.
    validate_btn.click(
        fn=validate_input,
        inputs=[text_input, model_dropdown],
        outputs=status_text
    )
    
    # Re-run the same validation whenever the text box value changes.
    text_input.change(
        fn=validate_input,
        inputs=[text_input, model_dropdown],
        outputs=status_text
    )
    
    # Text examples; each one targets text_input only.
    gr.Markdown("### 📝 Text Examples:")
    examples = [
        "Hello! How are you today?",
        "Welcome to the world of artificial intelligence.",
        "This is a demonstration of neural text-to-speech synthesis.",
        "Zero GPU makes high-quality speech generation accessible to everyone!"
    ]
    
    gr.Examples(
        examples=examples,
        inputs=text_input,
        label="Click on an example to use it"
    )
    
    # Information section (static text, collapsed by default)
    with gr.Accordion("ℹ️ Model Information", open=False):
        gr.Markdown("""
        **Available Models:**
        - **Base Model**: Default pre-trained model for general use
        - **Female Voice**: Optimized for female voice characteristics
        - **Male Voice**: Optimized for male voice characteristics
        
        **Features:**
        - Powered by NVIDIA NeMo Toolkit
        - High-quality 22kHz audio output
        - Zero GPU acceleration for fast inference
        - Support for long text sequences
        """)

if __name__ == "__main__":
    # Start the app only when run as a script: listen on every interface,
    # port 7860, and surface handler errors in the UI.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)