from create_env import setup_dependencies

# Install/verify runtime dependencies before importing anything that needs them
# (HF Spaces pattern: create_env must run before spaces/gradio/torch imports).
setup_dependencies()

import os

import gradio as gr
import numpy as np
import spaces
import torch

from util import Examples, InitModels, NemoAudioPlayer, load_config

# HuggingFace token for gated model downloads (None if the env var is unset).
token_ = os.getenv('HF_TOKEN')

config = load_config("./model_config.yaml")
models_configs = config.models
nemo_player_cfg = config.nemo_player

examples_cfg = load_config("./examples.yaml")
examples_maker = Examples(examples_cfg)
examples = examples_maker()

player = NemoAudioPlayer(nemo_player_cfg)
init_models = InitModels(models_configs, player, token_)
models = init_models()


@spaces.GPU
def generate_speech_gpu(text, model_choice, speaker_display: str, t, top_p, rp, max_tok):
    """Generate speech from text using the selected model on GPU.

    Args:
        text: Input text to synthesize.
        model_choice: Key into the module-level ``models`` dict.
        speaker_display: Human-readable speaker name shown in the UI; mapped
            back to the model's internal speaker id via its config. May be
            empty/None for base models without fixed speakers.
        t: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        rp: Repetition penalty.
        max_tok: Maximum number of tokens to generate.

    Returns:
        ``((sample_rate, audio_array), time_report)`` on success, or
        ``(None, error_message)`` on invalid input or failure.
    """
    # Guard against None/blank input before touching any model.
    if not text or not text.strip():
        return None, "Please enter text for speech generation."
    if not model_choice:
        return None, "Please select a model."

    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        selected_model = models[model_choice]

        # Map the display name back to the model's internal speaker id.
        cfg = models_configs.get(model_choice)
        speaker_map = cfg.get('speaker_id', {}) if cfg is not None else {}
        if speaker_display and speaker_map:
            speaker_id = speaker_map.get(speaker_display)
        else:
            speaker_id = None

        print(f"Generating speech with {model_choice}...")
        audio, _, time_report = selected_model.run_model(
            text, speaker_id, t, top_p, rp, max_tok
        )

        # NOTE(review): fixed output rate — assumed to match the vocoder;
        # confirm against the nemo_player config.
        sample_rate = 22050

        print("Speech generation completed!")
        return (sample_rate, audio), time_report

    except Exception as e:
        # Boundary handler: surface the failure in the UI instead of
        # crashing the Spaces worker.
        print(f"Error during generation: {str(e)}")
        return None, f"❌ Error during generation: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
    gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate emotional speech")

    with gr.Row():
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=list(models_configs.keys()),
                value=next(iter(models_configs)),
                label="Selected Model",
                info="Base generates random voices"
            )

            # Speaker selector (shown only if model has speakers).
            # Pre-populate all available speakers so example-table rows that
            # reference any speaker render correctly.
            all_speakers = []
            for _cfg in models_configs.values():
                if _cfg and _cfg.get('speaker_id'):
                    all_speakers.extend(list(_cfg.speaker_id.keys()))
            all_speakers = sorted(set(all_speakers))

            speaker_dropdown = gr.Dropdown(
                choices=all_speakers,
                value=None,
                label="Speaker",
                visible=False,
                allow_custom_value=True
            )

            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter your text ...",
                lines=3,
                max_lines=10
            )

            with gr.Accordion("Settings", open=False):
                temp = gr.Slider(
                    minimum=0.1, maximum=1.5, value=1.4, step=0.05,
                    label="Temp",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top P",
                )
                rp = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                    label="Repetition Penalty",
                )
                max_tok = gr.Slider(
                    minimum=100, maximum=2000, value=1200, step=100,
                    label="Max Tokens",
                )

            generate_btn = gr.Button("Run", variant="primary", size="lg")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy"
            )
            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3
            )

    # Update speakers when model changes.
    def update_speakers(model_choice):
        """Return a Dropdown update showing the selected model's speakers.

        Hides the dropdown entirely when the model has no speaker_id map.
        """
        cfg = models_configs.get(model_choice)
        speakers = list(cfg.speaker_id.keys()) if (cfg and cfg.get('speaker_id')) else []
        if speakers:
            return gr.update(choices=speakers, value=speakers[0], visible=True)
        else:
            return gr.update(choices=[], value=None, visible=False)

    model_dropdown.change(
        fn=update_speakers,
        inputs=[model_dropdown],
        outputs=[speaker_dropdown]
    )

    # Populate speakers on initial page load based on the default model.
    demo.load(
        fn=update_speakers,
        inputs=[model_dropdown],
        outputs=[speaker_dropdown]
    )

    # GPU generation event.
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
        outputs=[audio_output, time_report_output]
    )

    with gr.Row():
        gr.Examples(
            examples=examples,
            inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
            fn=generate_speech_gpu,
            outputs=[audio_output, time_report_output],
            cache_examples=True,
        )


if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )