from create_env import setup_dependencies
setup_dependencies()

import spaces
import gradio as gr
from util import NemoAudioPlayer, InitModels, load_config, Examples
import numpy as np
import torch
import os

# Get HuggingFace token
token_ = os.getenv('HF_TOKEN')

# Load model and player configuration
config = load_config("./model_config.yaml")
models_configs = config.models
nemo_player_cfg = config.nemo_player

# Build the examples table shown in the UI
examples_cfg = load_config("./examples.yaml")
examples_maker = Examples(examples_cfg)
examples = examples_maker()

# Initialize the NeMo audio player and all configured models
player = NemoAudioPlayer(nemo_player_cfg)
init_models = InitModels(models_configs, player, token_)
models = init_models()
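# The exact schema of model_config.yaml lives in the repo; the sketch below is only an
# assumption inferred from how models_configs is used in this file (model names as keys,
# with an optional 'speaker_id' map from display names to internal speaker ids):
#
# models:
#   <model-name>:
#     speaker_id:              # optional; omit for models that generate random voices
#       <speaker display name>: <speaker id>
# nemo_player:
#   ...                        # settings consumed by NemoAudioPlayer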
# ZeroGPU: request a GPU for the duration of each generation call.
# (Assumed from the otherwise unused `spaces` import and the Space running on ZeroGPU.)
@spaces.GPU
def generate_speech_gpu(text, model_choice, speaker_display: str, t, top_p, rp, max_tok):
    """
    Generate speech from text using the selected model on GPU.
    """
    if not text.strip():
        return None, "Please enter text for speech generation."

    if not model_choice:
        return None, "Please select a model."

    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        selected_model = models[model_choice]

        # Map the human-readable speaker name back to the model's internal speaker id
        cfg = models_configs.get(model_choice)
        speaker_map = cfg.get('speaker_id', {}) if cfg is not None else {}
        if speaker_display and speaker_map:
            speaker_id = speaker_map.get(speaker_display)
        else:
            speaker_id = None

        print(f"Generating speech with {model_choice}...")
        audio, _, time_report = selected_model.run_model(text, speaker_id, t, top_p, rp, max_tok)
        sample_rate = 22050

        print("Speech generation completed!")
        return (sample_rate, audio), time_report
    except Exception as e:
        print(f"Error during generation: {str(e)}")
        return None, f"Error during generation: {str(e)}"
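# Minimal usage sketch (not from the original file): calling the generation function
# directly, outside Gradio. The text and sampling values are illustrative only; the
# first configured model is used with no named speaker:
#
#     (sr, audio), report = generate_speech_gpu(
#         "Hello there!", list(models_configs.keys())[0], None,
#         t=1.4, top_p=0.95, rp=1.1, max_tok=1200,
#     )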
# Create Gradio interface
with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
    gr.Markdown("# KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate emotional speech")

    with gr.Row():
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=list(models_configs.keys()),
                value=list(models_configs.keys())[0],
                label="Selected Model",
                info="Base generates random voices"
            )

            # Speaker selector (shown only if model has speakers)
            # Pre-populate all available speakers for example table rendering
            all_speakers = []
            for _cfg in models_configs.values():
                if _cfg and _cfg.get('speaker_id'):
                    all_speakers.extend(list(_cfg.speaker_id.keys()))
            all_speakers = sorted(set(all_speakers))

            speaker_dropdown = gr.Dropdown(
                choices=all_speakers,
                value=None,
                label="Speaker",
                visible=False,
                allow_custom_value=True
            )

            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter your text ...",
                lines=3,
                max_lines=10
            )
            with gr.Accordion("Settings", open=False):
                temp = gr.Slider(
                    minimum=0.1, maximum=1.5, value=1.4, step=0.05,
                    label="Temperature",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top P",
                )
                rp = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                    label="Repetition Penalty",
                )
                max_tok = gr.Slider(
                    minimum=100, maximum=2000, value=1200, step=100,
                    label="Max Tokens",
                )
            generate_btn = gr.Button("Run", variant="primary", size="lg")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy"
            )
            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3
            )
    # Update speakers when model changes
    def update_speakers(model_choice):
        cfg = models_configs.get(model_choice)
        speakers = list(cfg.speaker_id.keys()) if (cfg and cfg.get('speaker_id')) else []
        if speakers:
            return gr.update(choices=speakers, value=speakers[0], visible=True)
        else:
            return gr.update(choices=[], value=None, visible=False)

    model_dropdown.change(
        fn=update_speakers,
        inputs=[model_dropdown],
        outputs=[speaker_dropdown]
    )

    # Populate speakers on initial page load based on default model
    demo.load(
        fn=update_speakers,
        inputs=[model_dropdown],
        outputs=[speaker_dropdown]
    )

    # GPU generation event
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
        outputs=[audio_output, time_report_output]
    )
    # Example prompts built from examples.yaml, with cached outputs
    with gr.Row():
        gr.Examples(
            examples=examples,
            inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
            fn=generate_speech_gpu,
            outputs=[audio_output, time_report_output],
            cache_examples=True,
        )
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )
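# Local run sketch (filename assumed; requires the dependencies installed by create_env):
#   python app.py        # then open http://localhost:7860
# On the Hugging Face Space this script presumably serves as the app entry point,
# with ZeroGPU hardware allocated per call via the @spaces.GPU decorator.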