# NOTE(review): the following header lines were HuggingFace file-viewer
# residue (Space status badges, commit hashes, and a line-number gutter)
# that made the file unparseable; collapsed into this comment.
# Spaces: Running on Zero — file size 5,949 bytes.
from create_env import setup_dependencies
setup_dependencies()
import spaces
import gradio as gr
from util import NemoAudioPlayer, InitModels, load_config, Examples
import numpy as np
import torch
import os
# Get HuggingFace token (used to download gated model weights; may be None locally)
token_ = os.getenv('HF_TOKEN')
# Load model and player configuration from YAML.
config = load_config("./model_config.yaml")
models_configs = config.models
nemo_player_cfg = config.nemo_player
# Build the example rows shown in the gr.Examples table below.
examples_cfg = load_config("./examples.yaml")
examples_maker = Examples(examples_cfg)
examples = examples_maker()
# Shared audio player/decoder passed to every model wrapper.
player = NemoAudioPlayer(nemo_player_cfg)
# Eagerly instantiate all configured models keyed by display name.
init_models = InitModels(models_configs, player, token_)
models = init_models()
@spaces.GPU
def generate_speech_gpu(text, model_choice, speaker_display: str, t, top_p, rp, max_tok):
    """
    Generate speech from text using the selected model on GPU.

    Args:
        text: Input text to synthesize.
        model_choice: Key into the module-level ``models`` dict.
        speaker_display: Human-readable speaker name; mapped to an internal
            speaker id via the model config's ``speaker_id`` table. Ignored
            when the selected model has no speaker table.
        t: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        rp: Repetition penalty.
        max_tok: Maximum number of tokens to generate.

    Returns:
        ``((sample_rate, audio), time_report)`` on success, or
        ``(None, error_message)`` on invalid input or generation failure.
    """
    # Guard clauses for empty/missing UI inputs.
    if not text.strip():
        return None, "Please enter text for speech generation."
    if not model_choice:
        return None, "Please select a model."
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")
        selected_model = models[model_choice]
        cfg = models_configs.get(model_choice)
        # Models without a speaker table get speaker_id=None (random voice).
        speaker_map = cfg.get('speaker_id', {}) if cfg is not None else {}
        if speaker_display and speaker_map:
            speaker_id = speaker_map.get(speaker_display)
        else:
            speaker_id = None
        print(f"Generating speech with {model_choice}...")
        audio, _, time_report = selected_model.run_model(text, speaker_id, t, top_p, rp, max_tok)
        # NOTE(review): sample rate is hard-coded; presumably matches the
        # NemoAudioPlayer output rate — confirm against nemo_player config.
        sample_rate = 22050
        print("Speech generation completed!")
        # (Removed a dead, line-wrapped comment fragment that had spilled
        # onto its own line and broke the file's syntax.)
        return (sample_rate, audio), time_report
    except Exception as e:
        # Broad catch is deliberate: this is the UI boundary, so report the
        # error to the user instead of crashing the Space.
        print(f"Error during generation: {str(e)}")
        return None, f"β Error during generation: {str(e)}"
# Create Gradio interface
# Create Gradio interface: model/speaker selectors + sampling settings on the
# left, generated audio and a timing report on the right.
with gr.Blocks(title="π» KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
    gr.Markdown("# π» KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate emotional speech")
    with gr.Row():
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=list(models_configs.keys()),
                value=list(models_configs.keys())[0],
                label="Selected Model",
                info="Base generates random voices"
            )
            # Speaker selector (shown only if model has speakers)
            # Pre-populate all available speakers across every model so the
            # example table can render rows for any model's speakers.
            all_speakers = []
            for _cfg in models_configs.values():
                if _cfg and _cfg.get('speaker_id'):
                    all_speakers.extend(list(_cfg.speaker_id.keys()))
            all_speakers = sorted(list(set(all_speakers)))
            speaker_dropdown = gr.Dropdown(
                choices=all_speakers,
                value=None,
                label="Speaker",
                visible=False,
                allow_custom_value=True
            )
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter your text ...",
                lines=3,
                max_lines=10
            )
            with gr.Accordion("Settings", open=False):
                temp = gr.Slider(
                    minimum=0.1, maximum=1.5, value=1.4, step=0.05,
                    label="Temp",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top P",
                )
                rp = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                    label="Repetition Penalty",
                )
                max_tok = gr.Slider(
                    minimum=100, maximum=2000, value=1200, step=100,
                    label="Max Tokens",
                )
            generate_btn = gr.Button("Run", variant="primary", size="lg")
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy"
            )
            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3
            )
    # Update speakers when model changes
    def update_speakers(model_choice):
        """Return a gr.update that shows the selected model's speakers,
        or hides the dropdown when the model has none."""
        cfg = models_configs.get(model_choice)
        speakers = list(cfg.speaker_id.keys()) if (cfg and cfg.get('speaker_id')) else []
        if speakers:
            return gr.update(choices=speakers, value=speakers[0], visible=True)
        else:
            return gr.update(choices=[], value=None, visible=False)
    model_dropdown.change(
        fn=update_speakers,
        inputs=[model_dropdown],
        outputs=[speaker_dropdown]
    )
    # Populate speakers on initial page load based on default model
    demo.load(
        fn=update_speakers,
        inputs=[model_dropdown],
        outputs=[speaker_dropdown]
    )
    # GPU generation event
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
        outputs=[audio_output, time_report_output]
    )
    with gr.Row():
        # (Removed a redundant `examples = examples` self-assignment here.)
        gr.Examples(
            examples=examples,
            inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
            fn=generate_speech_gpu,
            outputs=[audio_output, time_report_output],
            cache_examples=True,
        )
if __name__ == "__main__":
    # Bind to all interfaces on the standard Gradio port so the Space's
    # reverse proxy can reach the app; show server errors in the UI.
    # (Removed a stray trailing "|" scrape artifact that broke the syntax.)
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )