"""Gradio front end for KaniTTS text-to-speech.

The script installs a development build of ``transformers`` on first run,
declares the available voice models, and exposes a small Gradio UI with a
GPU-decorated generation handler and a lightweight validation handler.
"""

import os
import subprocess
import sys

# Fix OMP_NUM_THREADS issue before any imports
os.environ["OMP_NUM_THREADS"] = "4"

# Marker file written after a successful install so later runs skip pip.
DEPS_MARKER = '/tmp/deps_installed'

# Sample rate (Hz) handed to Gradio together with the generated audio.
# NOTE(review): assumed to match the audio produced by NemoAudioPlayer —
# confirm against util.Config.
SAMPLE_RATE = 22050


# Install dependencies programmatically to avoid conflicts
def setup_dependencies():
    """Install the transformers dev build once per container.

    If the marker file already exists the function returns immediately.
    Otherwise it force-reinstalls transformers from GitHub via pip and then
    writes the marker file. Any failure is printed and swallowed so the app
    still attempts to start (deliberate best-effort).
    """
    # Check if already installed
    if os.path.exists(DEPS_MARKER):
        return

    try:
        print("Installing transformers dev version...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "--force-reinstall", "--no-cache-dir",
            "git+https://github.com/huggingface/transformers.git",
        ])

        # Mark as installed only after pip succeeded
        with open(DEPS_MARKER, 'w') as f:
            f.write('done')
    except Exception as e:
        # Best-effort: report the problem but do not abort startup.
        print(f"Dependencies setup error: {e}")


# Run setup; this must happen before the imports below so that they see the
# freshly installed packages.
setup_dependencies()

import spaces
import gradio as gr
from util import Config, NemoAudioPlayer, KaniModel
import numpy as np
import torch

# Get HuggingFace token
token_ = os.getenv('HF_TOKEN')

# Model configurations: UI label -> Config used to build the model
models_configs = {
    'Base_pretrained_model': Config(),
    'Female_voice': Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex02-v.0.2',
        temperature=0.2,
    ),
    'Male_voice': Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex01-v.0.1',
        temperature=0.2,
    ),
}

# Global variables for models (loaded once)
player = None
models = {}


def initialize_models():
    """Create the shared audio player and all TTS models on first use.

    Populates the module-level ``player`` and ``models`` globals. Calls after
    the first successful one do nothing, so models are not reloaded.
    """
    global player, models

    if player is None:
        print("Initializing NeMo Audio Player...")
        player = NemoAudioPlayer(Config())
        print("NeMo Audio Player initialized!")

    if not models:
        print("Loading TTS models...")
        for model_name, config in models_configs.items():
            print(f"Loading {model_name}...")
            models[model_name] = KaniModel(config, player, token_)
            print(f"{model_name} loaded!")
        print("All models loaded!")


@spaces.GPU
def generate_speech_gpu(text, model_choice):
    """Generate speech from text using the selected model.

    Args:
        text: Text to synthesize; ``None`` is treated like an empty string.
        model_choice: Key of ``models_configs`` selecting the model.

    Returns:
        A ``(audio, status)`` pair. ``audio`` is ``(SAMPLE_RATE, samples)``
        on success and ``None`` otherwise; ``status`` is a message for the
        status textbox.
    """
    # Validate first so an empty request never triggers model loading.
    if not (text or "").strip():
        return None, "Please enter text for speech generation."

    if not model_choice:
        return None, "Please select a model."

    # Initialize models if not already done
    initialize_models()

    try:
        # Check GPU availability
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        # Get selected model; an unknown key raises KeyError, reported below
        selected_model = models[model_choice]

        # Generate audio
        print(f"Generating speech with {model_choice}...")
        audio, _ = selected_model.run_model(text)

        print("Speech generation completed!")

        # Gradio's numpy audio format is (sample_rate, audio_data)
        return (SAMPLE_RATE, audio), f"✅ Audio generated successfully using {model_choice} on {device}"

    except Exception as e:
        # Surface any generation failure in the status box instead of raising.
        print(f"Error during generation: {e}")
        return None, f"❌ Error during generation: {e}"


def validate_input(text, model_choice):
    """Return a status message describing whether the inputs are usable.

    Does not touch the models, so it runs without the GPU decorator.
    ``None`` text is treated like an empty string.
    """
    if not (text or "").strip():
        return "⚠️ Please enter text for speech generation."
    if not model_choice:
        return "⚠️ Please select a model."
    return f"✅ Ready to generate with {model_choice}"


# Create Gradio interface
with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎤 KaniTTS - Text to Speech with Zero GPU")
    gr.Markdown("Select a model and enter text to generate high-quality speech")

    with gr.Row():
        with gr.Column(scale=1):
            model_names = list(models_configs)
            model_dropdown = gr.Dropdown(
                choices=model_names,
                value=model_names[0],
                label="Select Model",
                info="Base - default model, Female - female voice, Male - male voice",
            )

            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="Enter text for speech generation...",
                lines=3,
                max_lines=10,
            )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

            # Quick validation button (CPU only)
            validate_btn = gr.Button("🔍 Validate Input", variant="secondary")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy",
            )

            status_text = gr.Textbox(
                label="Status",
                interactive=False,
                value="Ready to generate speech",
            )

    # GPU generation event
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown],
        outputs=[audio_output, status_text],
    )

    # CPU validation event
    validate_btn.click(
        fn=validate_input,
        inputs=[text_input, model_dropdown],
        outputs=status_text,
    )

    # Update status on input change
    text_input.change(
        fn=validate_input,
        inputs=[text_input, model_dropdown],
        outputs=status_text,
    )

    # Text examples
    gr.Markdown("### 📝 Text Examples:")
    examples = [
        "Hello! How are you today?",
        "Welcome to the world of artificial intelligence.",
        "This is a demonstration of neural text-to-speech synthesis.",
        "Zero GPU makes high-quality speech generation accessible to everyone!",
    ]

    gr.Examples(
        examples=examples,
        inputs=text_input,
        label="Click on an example to use it",
    )

    # Information section
    with gr.Accordion("ℹ️ Model Information", open=False):
        gr.Markdown("""
        **Available Models:**
        - **Base Model**: Default pre-trained model for general use
        - **Female Voice**: Optimized for female voice characteristics
        - **Male Voice**: Optimized for male voice characteristics

        **Features:**
        - Powered by NVIDIA NeMo Toolkit
        - High-quality 22kHz audio output
        - Zero GPU acceleration for fast inference
        - Support for long text sequences
        """)

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )