# KaniTTS / app.py
# Author: Den Pavloff
# Multispeaker, multilingual text-to-speech demo (commit eb18e14)
# Bootstrap: install/verify runtime dependencies BEFORE importing anything
# that relies on them — order matters here.
from create_env import setup_dependencies
setup_dependencies()
import spaces
import gradio as gr
from util import NemoAudioPlayer, InitModels, load_config, Examples
import numpy as np
import torch
import os
# Get HuggingFace token (used for downloading gated/private model weights)
token_ = os.getenv('HF_TOKEN')
# Model + audio-player configuration; attribute access below suggests an
# OmegaConf-style object is returned — TODO confirm against util.load_config
config = load_config("./model_config.yaml")
models_configs = config.models
nemo_player_cfg = config.nemo_player
# Rows for the gr.Examples table, built from a separate YAML file
examples_cfg = load_config("./examples.yaml")
examples_maker = Examples(examples_cfg)
examples = examples_maker()
# Shared audio decoder/player, then eager-load every configured model at startup
player = NemoAudioPlayer(nemo_player_cfg)
init_models = InitModels(models_configs, player, token_)
models = init_models()
@spaces.GPU
def generate_speech_gpu(text, model_choice, speaker_display: str, t, top_p, rp, max_tok):
    """
    Generate speech from text using the selected model on GPU.

    Args:
        text: Input text to synthesize.
        model_choice: Key into the module-level `models` / `models_configs` dicts.
        speaker_display: Human-readable speaker name; mapped to the model's
            internal speaker id via the config's `speaker_id` table.
        t: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        rp: Repetition penalty.
        max_tok: Maximum number of tokens to generate.

    Returns:
        ((sample_rate, audio), time_report) on success, or
        (None, error_message) on validation failure or generation error.
    """
    # Gradio can pass None for an untouched textbox; guard before .strip()
    # (calling .strip() on None raised AttributeError in the original).
    if not text or not text.strip():
        return None, "Please enter text for speech generation."
    if not model_choice:
        return None, "Please select a model."
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")
        selected_model = models[model_choice]
        cfg = models_configs.get(model_choice)
        # Resolve the display name to the model's speaker id. Models without a
        # speaker table — or an empty selection — get None, which lets the
        # model choose the voice itself.
        speaker_map = cfg.get('speaker_id', {}) if cfg is not None else {}
        if speaker_display and speaker_map:
            speaker_id = speaker_map.get(speaker_display)
        else:
            speaker_id = None
        print(f"Generating speech with {model_choice}...")
        audio, _, time_report = selected_model.run_model(text, speaker_id, t, top_p, rp, max_tok)
        # NOTE(review): hard-coded rate assumes every model emits 22.05 kHz — confirm
        sample_rate = 22050
        print("Speech generation completed!")
        return (sample_rate, audio), time_report
    except Exception as e:
        # Boundary handler: report the failure in the UI instead of crashing
        # the Space worker.
        print(f"Error during generation: {str(e)}")
        return None, f"❌ Error during generation: {str(e)}"
# Create Gradio interface
with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
    gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate emotional speech")
    with gr.Row():
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=list(models_configs.keys()),
                value=list(models_configs.keys())[0],
                label="Selected Model",
                info="Base generates random voices"
            )
            # Speaker selector (shown only if model has speakers)
            # Pre-populate all available speakers for example table rendering
            all_speakers = []
            for _cfg in models_configs.values():
                if _cfg and _cfg.get('speaker_id'):
                    all_speakers.extend(list(_cfg.speaker_id.keys()))
            # sorted() already returns a list, so no extra list() wrapper needed
            all_speakers = sorted(set(all_speakers))
            speaker_dropdown = gr.Dropdown(
                choices=all_speakers,
                value=None,
                label="Speaker",
                visible=False,
                # Examples may reference speakers hidden for the current model
                allow_custom_value=True
            )
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter your text ...",
                lines=3,
                max_lines=10
            )
            # Sampling hyperparameters, forwarded to run_model()
            with gr.Accordion("Settings", open=False):
                temp = gr.Slider(
                    minimum=0.1, maximum=1.5, value=1.4, step=0.05,
                    label="Temp",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top P",
                )
                rp = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                    label="Repetition Penalty",
                )
                max_tok = gr.Slider(
                    minimum=100, maximum=2000, value=1200, step=100,
                    label="Max Tokens",
                )
            generate_btn = gr.Button("Run", variant="primary", size="lg")
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy"
            )
            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3
            )
    # Update speakers when model changes
    def update_speakers(model_choice):
        """Return a dropdown update: the selected model's speakers, or hide it."""
        cfg = models_configs.get(model_choice)
        speakers = list(cfg.speaker_id.keys()) if (cfg and cfg.get('speaker_id')) else []
        if speakers:
            return gr.update(choices=speakers, value=speakers[0], visible=True)
        else:
            return gr.update(choices=[], value=None, visible=False)
    model_dropdown.change(
        fn=update_speakers,
        inputs=[model_dropdown],
        outputs=[speaker_dropdown]
    )
    # Populate speakers on initial page load based on default model
    demo.load(
        fn=update_speakers,
        inputs=[model_dropdown],
        outputs=[speaker_dropdown]
    )
    # GPU generation event
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
        outputs=[audio_output, time_report_output]
    )
    # Example table; outputs are cached so clicking a row replays stored audio.
    # (Removed a pointless `examples = examples` self-assignment here.)
    with gr.Row():
        gr.Examples(
            examples=examples,
            inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
            fn=generate_speech_gpu,
            outputs=[audio_output, time_report_output],
            cache_examples=True,
        )
if __name__ == "__main__":
    # Bind to all interfaces on 7860 (the Hugging Face Spaces default port)
    # and surface server-side errors in the browser UI.
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )
    demo.launch(**launch_options)