File size: 5,949 Bytes
eb18e14
164603c
 
 
 
 
eb18e14
164603c
 
eb18e14
164603c
 
 
 
eb18e14
 
 
 
 
 
 
164603c
eb18e14
 
 
164603c
e9bcb5a
164603c
eb18e14
164603c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb18e14
 
 
 
 
 
164603c
 
eb18e14
164603c
46cf002
164603c
 
52c0d1f
164603c
 
 
52c0d1f
164603c
 
0488cfb
088ca61
 
164603c
 
 
 
 
 
00e4cff
 
164603c
eb18e14
 
 
 
 
 
 
 
 
 
 
 
 
 
164603c
 
0488cfb
 
164603c
 
 
00e4cff
 
ad693da
2759e04
ad693da
00e4cff
 
bb2123e
00e4cff
 
ad693da
71a6999
00e4cff
 
ad693da
00e4cff
ad693da
00e4cff
164603c
ad693da
0488cfb
164603c
 
 
0488cfb
164603c
 
 
52c0d1f
 
 
aa6abd6
 
52c0d1f
164603c
eb18e14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164603c
 
 
eb18e14
52c0d1f
164603c
 
46cf002
 
eb18e14
0488cfb
 
 
eb18e14
4e3722d
949c8bd
0488cfb
 
164603c
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# Bootstrap: install/prepare runtime dependencies BEFORE importing them.
# `setup_dependencies()` must run ahead of `import spaces` / `import gradio`,
# which is why this import block is deliberately split in two.
from create_env import setup_dependencies

setup_dependencies()

import spaces
import gradio as gr
from util import NemoAudioPlayer, InitModels, load_config, Examples
import numpy as np
import torch
import os

# HuggingFace access token for gated model downloads (may be None if unset).
token_ = os.getenv('HF_TOKEN')

# Model + audio-player configuration loaded from YAML.
config = load_config("./model_config.yaml")
models_configs = config.models
nemo_player_cfg = config.nemo_player

# Example rows for the Gradio Examples table, built from examples.yaml.
examples_cfg = load_config("./examples.yaml")
examples_maker = Examples(examples_cfg)
examples = examples_maker()

# Instantiate the NeMo audio player and eagerly load all configured models;
# `models` maps model name -> ready-to-run model instance.
player = NemoAudioPlayer(nemo_player_cfg)
init_models = InitModels(models_configs, player, token_)
models = init_models()


@spaces.GPU
def generate_speech_gpu(text, model_choice, speaker_display: str, t, top_p, rp, max_tok):
    """
    Generate speech from text using the selected model on GPU.

    Args:
        text: Input text to synthesize (may arrive as None from Gradio).
        model_choice: Key into the global ``models`` dict selecting a TTS model.
        speaker_display: Human-readable speaker name; resolved to a model-specific
            speaker id via the model's ``speaker_id`` config map.
        t: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        rp: Repetition penalty.
        max_tok: Maximum number of tokens to generate.

    Returns:
        ``((sample_rate, audio), time_report)`` on success, or
        ``(None, error_message)`` when validation fails or generation raises.
    """
    # Guard against None as well as empty/whitespace text: Gradio can pass
    # None, and the original `text.strip()` would raise AttributeError.
    if not text or not text.strip():
        return None, "Please enter text for speech generation."

    if not model_choice:
        return None, "Please select a model."

    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        selected_model = models[model_choice]
        cfg = models_configs.get(model_choice)
        speaker_map = cfg.get('speaker_id', {}) if cfg is not None else {}
        # Resolve the display name to the model's internal speaker id; models
        # without speakers (or no selection) fall back to None.
        if speaker_display and speaker_map:
            speaker_id = speaker_map.get(speaker_display)
        else:
            speaker_id = None

        print(f"Generating speech with {model_choice}...")
        audio, _, time_report = selected_model.run_model(text, speaker_id, t, top_p, rp, max_tok)

        # NOTE(review): sample rate is hard-coded; presumably matches the
        # NemoAudioPlayer output rate — confirm against nemo_player config.
        sample_rate = 22050
        print("Speech generation completed!")

        return (sample_rate, audio), time_report

    except Exception as e:
        # UI boundary: surface the error to the user instead of crashing the app.
        print(f"Error during generation: {str(e)}")
        return None, f"❌ Error during generation: {str(e)}"

# Create Gradio interface: left column holds model/speaker/text/settings
# inputs, right column holds the generated audio and timing report.
with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
    gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate emotional speech")

    with gr.Row():
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=list(models_configs.keys()),
                value=list(models_configs.keys())[0],
                label="Selected Model",
                info="Base generates random voices"
            )
            # Speaker selector (shown only if model has speakers).
            # Pre-populate with the union of all models' speakers so the
            # Examples table can render speaker values for any model.
            all_speakers = []
            for _cfg in models_configs.values():
                if _cfg and _cfg.get('speaker_id'):
                    all_speakers.extend(list(_cfg.speaker_id.keys()))
            all_speakers = sorted(set(all_speakers))
            speaker_dropdown = gr.Dropdown(
                choices=all_speakers,
                value=None,
                label="Speaker",
                visible=False,
                allow_custom_value=True
            )

            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter your text ...",
                lines=3,
                max_lines=10
            )

            # Generation hyperparameters, collapsed by default.
            with gr.Accordion("Settings", open=False):
                temp = gr.Slider(
                    minimum=0.1, maximum=1.5, value=1.4, step=0.05,
                    label="Temp",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top P",
                )
                rp = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
                    label="Repetition Penalty",
                )
                max_tok = gr.Slider(
                    minimum=100, maximum=2000, value=1200, step=100,
                    label="Max Tokens",
                )

            generate_btn = gr.Button("Run", variant="primary", size="lg")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy"
            )

            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3
            )

    # Update the speaker dropdown whenever the selected model changes.
    def update_speakers(model_choice):
        """Return a gr.update showing the chosen model's speakers, or hiding
        the dropdown entirely for models without a speaker map."""
        cfg = models_configs.get(model_choice)
        speakers = list(cfg.speaker_id.keys()) if (cfg and cfg.get('speaker_id')) else []
        if speakers:
            return gr.update(choices=speakers, value=speakers[0], visible=True)
        else:
            return gr.update(choices=[], value=None, visible=False)

    model_dropdown.change(
        fn=update_speakers,
        inputs=[model_dropdown],
        outputs=[speaker_dropdown]
    )

    # Populate speakers on initial page load based on default model.
    demo.load(
        fn=update_speakers,
        inputs=[model_dropdown],
        outputs=[speaker_dropdown]
    )

    # GPU generation event
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
        outputs=[audio_output, time_report_output]
    )

    # Clickable example rows (built at module load from examples.yaml);
    # outputs are cached so examples play instantly.
    with gr.Row():
        gr.Examples(
            examples=examples,
            inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
            fn=generate_speech_gpu,
            outputs=[audio_output, time_report_output],
            cache_examples=True,
        )

if __name__ == "__main__":
    # Bind on all interfaces (container-friendly) on the standard Spaces port;
    # show_error surfaces tracebacks in the browser instead of a generic page.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)