Spaces:
Running
on
Zero
Running
on
Zero
| import os | |
| import subprocess | |
| import sys | |
# Force the OpenMP thread count to 4 before any later import can read it.
os.environ.update({"OMP_NUM_THREADS": "4"})
# Install dependencies programmatically to avoid conflicts
def setup_dependencies():
    """Install the development build of transformers unless already done.

    A marker file at ``/tmp/deps_installed`` records a completed install;
    while that file exists the function returns without doing anything.
    Any exception raised along the way is printed and swallowed, so the
    caller always gets ``None`` back.
    """
    marker_path = '/tmp/deps_installed'
    pip_command = [
        sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-cache-dir",
        "git+https://github.com/huggingface/transformers.git",
    ]
    try:
        if not os.path.exists(marker_path):
            print("Installing transformers dev version...")
            subprocess.check_call(pip_command)
            # Written only after pip succeeded, so a failed install is retried.
            with open(marker_path, 'w') as marker:
                marker.write('done')
    except Exception as e:
        print(f"Dependencies setup error: {e}")
# Run setup at import time, ahead of the third-party imports that follow.
# NOTE(review): presumably so those imports pick up the freshly installed
# transformers build — confirm against util.py.
setup_dependencies()
| import spaces | |
| import gradio as gr | |
| from util import Config, NemoAudioPlayer, KaniModel, Demo | |
| import numpy as np | |
| import torch | |
# Get HuggingFace token (None when HF_TOKEN is not set)
token_ = os.environ.get('HF_TOKEN')

# Model configurations: dropdown label -> Config used to build the model
models_configs = dict(
    Base_pretrained_model=Config(),
    Female_voice=Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex02-v.0.2',
        temperature=0.2,
    ),
    Male_voice=Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex01-v.0.1',
        temperature=0.2,
    ),
)

# Global objects shared by every request (created once, at import time)
player = NemoAudioPlayer(Config())
demo_examples = Demo()()

# Build one KaniModel per configuration; all of them share the same player.
models = {}
for model_name in models_configs:
    config = models_configs[model_name]
    print(f"Loading {model_name}...")
    models[model_name] = KaniModel(config, player, token_)
    print(f"{model_name} loaded!")
print("All models loaded!")
| # def initialize_models(): | |
| # """Initialize models globally to avoid reloading""" | |
| # global models | |
| # # if player is None: | |
| # # print("Initializing NeMo Audio Player...") | |
| # # player = NemoAudioPlayer(Config()) | |
| # # print("NeMo Audio Player initialized!") | |
| # if not models: | |
| # print("Loading TTS models...") | |
| # for model_name, config in models_configs.items(): | |
| # print(f"Loading {model_name}...") | |
| # models[model_name] = KaniModel(config, player, token_) | |
| # print(f"{model_name} loaded!") | |
| # print("All models loaded!") | |
def generate_speech_gpu(text, model_choice):
    """Generate speech from text using the selected model.

    Args:
        text: Text to synthesise. ``None``, empty, or whitespace-only input
            is rejected with a message instead of raising.
        model_choice: Key into the module-level ``models`` mapping. A falsy
            value is rejected with a message.

    Returns:
        A ``(audio, message)`` pair. On success ``audio`` is
        ``(22050, samples)`` and ``message`` is the time report returned by
        the model's ``run_model``. On invalid input or any failure during
        generation ``audio`` is ``None`` and ``message`` explains why.
    """
    # The UI can hand over None as well as ""; calling .strip() on None
    # would raise before the try block below could report it.
    if not text or not text.strip():
        return None, "Please enter text for speech generation."
    if not model_choice:
        return None, "Please select a model."

    try:
        # Logged for diagnostics only; the value is not passed to the model.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        # An unknown model_choice raises KeyError, reported by the handler below.
        selected_model = models[model_choice]

        print(f"Generating speech with {model_choice}...")
        audio, _, time_report = selected_model.run_model(text)
        sample_rate = 22050
        print("Speech generation completed!")
        return (sample_rate, audio), time_report
    except Exception as e:
        # UI boundary: report the failure to the user instead of propagating.
        print(f"Error during generation: {e}")
        return None, f"β Error during generation: {e}"
| # def validate_input(text, model_choice): | |
| # """Quick validation without GPU""" | |
| # if not text.strip(): | |
| # return "⚠️ Please enter text for speech generation." | |
| # if not model_choice: | |
| # return "β οΈ Please select a model." | |
| # return f"β Ready to generate with {model_choice}" | |
# Create Gradio interface
with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
    gr.Markdown("# KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate high-quality speech")

    with gr.Row():
        # Left column: model choice, input text and the trigger button.
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=list(models_configs.keys()),
                value=list(models_configs.keys())[0],
                label="Select Model",
                info="Base - default model, Female - female voice, Male - male voice"
            )
            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="Enter text for speech generation...",
                lines=3,
                max_lines=10
            )
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
            # Quick validation button (CPU only)
            # validate_btn = gr.Button("Validate Input", variant="secondary")

        # Right column: generated audio and the accompanying text report.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy"
            )
            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3
            )

    # GPU generation event
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown],
        outputs=[audio_output, time_report_output]
    )

    # Demo Examples
    gr.Markdown("## 🎯 Demo Examples")

    def play_demo(text):
        """Return the stored demo audio for *text* plus the label ``'DEMO'``.

        The result is ``((22050, audio), 'DEMO')``, matching the two outputs
        (audio component, report textbox) wired to each demo button.
        """
        return (22050, demo_examples[text]), 'DEMO'

    # At most eight demo buttons, laid out as two rows of up to four each.
    demo_texts = list(demo_examples.keys())[:8]
    for row_start in (0, 4):
        with gr.Row():
            for text in demo_texts[row_start:row_start + 4]:
                # Bind the current text as a default argument; a plain closure
                # would see only the last value of the loop variable.
                gr.Button(text).click(lambda t=text: play_demo(t), outputs=[audio_output, time_report_output])
| # # CPU validation event | |
| # validate_btn.click( | |
| # fn=validate_input, | |
| # inputs=[text_input, model_dropdown], | |
| # outputs=status_text | |
| # ) | |
| # # Update status on input change | |
| # text_input.change( | |
| # fn=validate_input, | |
| # inputs=[text_input, model_dropdown], | |
| # outputs=status_text | |
| # ) | |
| # Text examples | |
| # gr.Markdown("### π Text Examples:") | |
| # examples = [ | |
| # "Hello! How are you today?", | |
| # "Welcome to the world of artificial intelligence.", | |
| # "This is a demonstration of neural text-to-speech synthesis.", | |
| # "Zero GPU makes high-quality speech generation accessible to everyone!" | |
| # ] | |
| # gr.Examples( | |
| # examples=examples, | |
| # inputs=text_input, | |
| # label="Click on an example to use it" | |
| # ) | |
| # # Information section | |
| # with gr.Accordion("ℹ️ Model Information", open=False): | |
| # gr.Markdown(""" | |
| # **Available Models:** | |
| # - **Base Model**: Default pre-trained model for general use | |
| # - **Female Voice**: Optimized for female voice characteristics | |
| # - **Male Voice**: Optimized for male voice characteristics | |
| # **Features:** | |
| # - Powered by NVIDIA NeMo Toolkit | |
| # - High-quality 22kHz audio output | |
| # - Zero GPU acceleration for fast inference | |
| # - Support for long text sequences | |
| # """) | |
if __name__ == "__main__":
    # Listen on every interface at port 7860, with show_error enabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)