"""Gradio demo application for KaniTTS text-to-speech generation.

At import time this script:

1. pins ``OMP_NUM_THREADS`` and (once per container) force-reinstalls the
   development version of ``transformers``;
2. builds one ``KaniModel`` per entry of ``models_configs``, all sharing a
   single ``NemoAudioPlayer``;
3. declares the Gradio ``Blocks`` UI bound to the name ``demo``.

The UI is only launched when the file is executed as a script.
"""
import os
import subprocess
import sys

# Fix OMP_NUM_THREADS issue before any imports
os.environ["OMP_NUM_THREADS"] = "4"

# Marker file whose presence means the dependency install already ran.
DEPS_MARKER_PATH = '/tmp/deps_installed'

# Sample rate (Hz) paired with every audio array handed to the Gradio
# Audio component, for both generated speech and the bundled demo clips.
SAMPLE_RATE = 22050


def setup_dependencies():
    """Install the development version of ``transformers`` once.

    The install is skipped when ``DEPS_MARKER_PATH`` already exists. On
    success the marker file is written so later runs skip the install.

    This is deliberately best-effort: a failing ``pip`` invocation or a
    failing marker write is reported on stdout and otherwise ignored, so
    the application still tries to start with whatever is installed.
    """
    # Check if already installed
    if os.path.exists(DEPS_MARKER_PATH):
        return

    try:
        print("Installing transformers dev version...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "--force-reinstall", "--no-cache-dir",
            "git+https://github.com/huggingface/transformers.git",
        ])

        # Mark as installed
        with open(DEPS_MARKER_PATH, 'w') as f:
            f.write('done')
    except (subprocess.SubprocessError, OSError) as e:
        print(f"Dependencies setup error: {e}")


# Run setup before the remaining imports.
# NOTE(review): presumably so the imports below pick up the freshly
# installed transformers build -- confirm before reordering.
setup_dependencies()

import spaces
import gradio as gr
from util import Config, NemoAudioPlayer, KaniModel, Demo
import numpy as np
import torch

# Get HuggingFace token
token_ = os.getenv('HF_TOKEN')

# Model configurations: UI label -> Config used to build the model.
models_configs = {
    'Base_pretrained_model': Config(),
    'Female_voice': Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex02-v.0.2',
        temperature=0.2,
    ),
    'Male_voice': Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex01-v.0.1',
        temperature=0.2,
    ),
}

# Global variables for models (loaded once)
player = NemoAudioPlayer(Config())
# Demo() is instantiated and then called; the result is used below as a
# mapping from example text to an audio array.
demo_examples = Demo()()

models = {}
for model_name, config in models_configs.items():
    print(f"Loading {model_name}...")
    models[model_name] = KaniModel(config, player, token_)
    print(f"{model_name} loaded!")
print("All models loaded!")


@spaces.GPU
def generate_speech_gpu(text, model_choice):
    """Generate speech from text using the selected model.

    Args:
        text: Text to synthesise. ``None``, empty, or whitespace-only
            input is rejected with a message instead of raising.
        model_choice: Key into the module-level ``models`` dict.

    Returns:
        A 2-tuple ``(audio, message)`` matching the two Gradio outputs:
        on success ``((SAMPLE_RATE, audio_array), time_report)``; on
        invalid input or any error during generation ``(None, message)``.
    """
    # Validation happens outside the try block so these messages are
    # returned without the "Error during generation" prefix.
    if not text or not text.strip():
        return None, "Please enter text for speech generation."

    if not model_choice:
        return None, "Please select a model."

    try:
        # Check GPU availability (informational only; the value is logged
        # and not passed to the model here).
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        # Get selected model; an unknown key raises KeyError, which is
        # reported through the handler below.
        selected_model = models[model_choice]

        # Generate audio; the second returned value is unused here.
        print(f"Generating speech with {model_choice}...")
        audio, _, time_report = selected_model.run_model(text)

        print("Speech generation completed!")
        return (SAMPLE_RATE, audio), time_report
    except Exception as e:
        # UI boundary: surface every failure as a message instead of
        # letting the exception propagate to Gradio.
        print(f"Error during generation: {e}")
        return None, f"❌ Error during generation: {e}"


def play_demo(text):
    """Return the pre-generated demo clip for ``text`` plus a 'DEMO' label.

    Args:
        text: Key into ``demo_examples``.

    Returns:
        ``((SAMPLE_RATE, audio_array), 'DEMO')`` for the two UI outputs.
    """
    return (SAMPLE_RATE, demo_examples[text]), 'DEMO'


# Create Gradio interface
with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
    gr.Markdown("# KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate high-quality speech")

    model_names = list(models_configs)

    with gr.Row():
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=model_names,
                value=model_names[0],
                label="Select Model",
                info="Base - default model, Female - female voice, Male - male voice",
            )

            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="Enter text for speech generation...",
                lines=3,
                max_lines=10,
            )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy",
            )

            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3,
            )

    # GPU generation event
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown],
        outputs=[audio_output, time_report_output],
    )

    # Demo Examples
    gr.Markdown("## 🎯 Demo Examples")

    # Up to eight demo buttons, laid out as two rows of four.
    demo_texts = list(demo_examples.keys())
    for row_start in (0, 4):
        with gr.Row():
            for text in demo_texts[row_start:row_start + 4]:
                # Bind the current text as a default argument so each
                # button keeps its own value (avoids late binding).
                gr.Button(text).click(
                    lambda t=text: play_demo(t),
                    outputs=[audio_output, time_report_output],
                )


if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )