Spaces: Running on Zero
# Bootstrap runtime dependencies before the heavyweight imports below.
from create_env import setup_dependencies

setup_dependencies()

import spaces
import gradio as gr
from util import NemoAudioPlayer, InitModels, load_config, Examples
import numpy as np
import torch
import os

# Get HuggingFace token
token_ = os.getenv('HF_TOKEN')

# Model and audio-player configuration loaded from YAML.
config = load_config("./model_config.yaml")
models_configs = config.models
nemo_player_cfg = config.nemo_player

# Pre-baked rows for the Gradio Examples table.
examples_cfg = load_config("./examples.yaml")
examples_maker = Examples(examples_cfg)
examples = examples_maker()

# Shared audio player and the per-model TTS wrappers (keyed by model name).
player = NemoAudioPlayer(nemo_player_cfg)
init_models = InitModels(models_configs, player, token_)
models = init_models()
def generate_speech_gpu(text, model_choice, speaker_display: str, t, top_p, rp, max_tok):
    """Synthesize speech for *text* with the selected model.

    Returns ((sample_rate, waveform), time_report) on success, or
    (None, message) when validation fails or generation raises.
    """
    # Guard clauses: empty text / missing model short-circuit with a message.
    if not text.strip():
        return None, "Please enter text for speech generation."
    if not model_choice:
        return None, "Please select a model."

    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        selected_model = models[model_choice]

        # Map the human-readable speaker label to the model's internal id;
        # models without a speaker table always get None.
        cfg = models_configs.get(model_choice)
        speaker_map = {} if cfg is None else cfg.get('speaker_id', {})
        speaker_id = speaker_map.get(speaker_display) if (speaker_display and speaker_map) else None

        print(f"Generating speech with {model_choice}...")
        audio, _, time_report = selected_model.run_model(text, speaker_id, t, top_p, rp, max_tok)

        sample_rate = 22050  # rate handed to the Gradio Audio component
        print("Speech generation completed!")
        return (sample_rate, audio), time_report
    except Exception as e:
        # UI boundary: surface the failure as a message instead of crashing.
        print(f"Error during generation: {str(e)}")
        return None, f"❌ Error during generation: {str(e)}"
# Create Gradio interface
with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Ocean()) as demo:
    gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate emotional speech")

    with gr.Row():
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=list(models_configs.keys()),
                value=list(models_configs.keys())[0],
                label="Selected Model",
                info="Base generates random voices"
            )

            # Speaker selector (shown only if model has speakers).
            # Pre-populate the union of every model's speakers so the example
            # table can render rows that reference any speaker.
            all_speakers = sorted({
                speaker
                for _cfg in models_configs.values()
                if _cfg and _cfg.get('speaker_id')
                for speaker in _cfg.speaker_id.keys()
            })
            speaker_dropdown = gr.Dropdown(
                choices=all_speakers,
                value=None,
                label="Speaker",
                visible=False,
                allow_custom_value=True
            )

            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter your text ...",
                lines=3,
                max_lines=10
            )

            # Sampling hyper-parameters, forwarded verbatim to run_model().
            with gr.Accordion("Settings", open=False):
                temp = gr.Slider(
                    minimum=0.1, maximum=1.5, value=1.4, step=0.05,
                    label="Temp",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top P",
                )
                rp = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                    label="Repetition Penalty",
                )
                max_tok = gr.Slider(
                    minimum=100, maximum=2000, value=1200, step=100,
                    label="Max Tokens",
                )

            generate_btn = gr.Button("Run", variant="primary", size="lg")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy"
            )
            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3
            )

    # Update speakers when model changes
    def update_speakers(model_choice):
        """Return a Dropdown update with the chosen model's speakers.

        Hides the dropdown entirely for models without a speaker table.
        """
        cfg = models_configs.get(model_choice)
        speakers = list(cfg.speaker_id.keys()) if (cfg and cfg.get('speaker_id')) else []
        if speakers:
            return gr.update(choices=speakers, value=speakers[0], visible=True)
        else:
            return gr.update(choices=[], value=None, visible=False)

    model_dropdown.change(
        fn=update_speakers,
        inputs=[model_dropdown],
        outputs=[speaker_dropdown]
    )

    # Populate speakers on initial page load based on default model
    demo.load(
        fn=update_speakers,
        inputs=[model_dropdown],
        outputs=[speaker_dropdown]
    )

    # GPU generation event
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
        outputs=[audio_output, time_report_output]
    )

    with gr.Row():
        # Cached examples: outputs are pre-generated once and replayed on click.
        # (Removed a redundant `examples = examples` self-assignment here.)
        gr.Examples(
            examples=examples,
            inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
            fn=generate_speech_gpu,
            outputs=[audio_output, time_report_output],
            cache_examples=True,
        )
if __name__ == "__main__":
    # Bind on all interfaces so the app is reachable inside the Spaces container.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)