Spaces · Running on Zero
Den Pavloff committed
Commit · eb18e14 · Parent: 2759e04

multispeaker, multilang
Browse files:
- app.py +60 -64
- create_env.py +21 -0
- examples.yaml +98 -0
- model_config.yaml +36 -0
- requirements.txt +2 -1
- util.py +178 -17
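This commit moves model and example definitions out of app.py into two new YAML files (model_config.yaml, examples.yaml) loaded with OmegaConf, adds a per-model speaker dropdown, and splits the dependency bootstrap into create_env.py. Condensed from the diffs below, the new startup flow is roughly (a sketch; error handling omitted):

import os
from util import NemoAudioPlayer, InitModels, load_config, Examples

config = load_config("./model_config.yaml")        # nemo_player + models sections
player = NemoAudioPlayer(config.nemo_player)       # shared codec decoder
models = InitModels(config.models, player, os.getenv("HF_TOKEN"))()  # {name: KaniModel}
examples = Examples(load_config("./examples.yaml"))()                # rows for gr.Examples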
app.py
CHANGED

@@ -1,65 +1,32 @@
-import os
-import subprocess
-import sys
-
-# Fix OMP_NUM_THREADS issue before any imports
-os.environ["OMP_NUM_THREADS"] = "4"
-
-# Install dependencies programmatically to avoid conflicts
-def setup_dependencies():
-    try:
-        # Check if already installed
-        if os.path.exists('/tmp/deps_installed'):
-            return
-
-        print("Installing transformers dev version...")
-        subprocess.check_call([
-            sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-cache-dir",
-            "git+https://github.com/huggingface/transformers.git"
-        ])
-
-        # Mark as installed
-        with open('/tmp/deps_installed', 'w') as f:
-            f.write('done')
-
-    except Exception as e:
-        print(f"Dependencies setup error: {e}")
-
-# Run setup
+from create_env import setup_dependencies
+
 setup_dependencies()
 
 import spaces
 import gradio as gr
-from util import …
+from util import NemoAudioPlayer, InitModels, load_config, Examples
 import numpy as np
 import torch
+import os
 
 # Get HuggingFace token
 token_ = os.getenv('HF_TOKEN')
 
-
-models_configs = {
-    …
-        model_name='nineninesix/kani-tts-450m-0.1-ft',
-    )
-}
+config = load_config("./model_config.yaml")
+models_configs = config.models
+nemo_player_cfg = config.nemo_player
+
+examples_cfg = load_config("./examples.yaml")
+examples_maker = Examples(examples_cfg)
+examples = examples_maker()
 
-
-
-models = {}
-for model_name, config in models_configs.items():
-    print(f"Loading {model_name}...")
-    models[model_name] = KaniModel(config, player, token_)
-    print(f"{model_name} loaded!")
-print("All models loaded!")
+player = NemoAudioPlayer(nemo_player_cfg)
+init_models = InitModels(models_configs, player, token_)
+models = init_models()
 
 
 @spaces.GPU
-def generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok):
+def generate_speech_gpu(text, model_choice, speaker_display: str, t, top_p, rp, max_tok):
     """
     Generate speech from text using the selected model on GPU
     """
@@ -71,16 +38,19 @@ def generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok):
         return None, "Please select a model."
 
     try:
-        # Check GPU availability
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")
 
-        # Get selected model
        selected_model = models[model_choice]
+        cfg = models_configs.get(model_choice)
+        speaker_map = cfg.get('speaker_id', {}) if cfg is not None else {}
+        if speaker_display and speaker_map:
+            speaker_id = speaker_map.get(speaker_display)
+        else:
+            speaker_id = None
 
-        # Generate audio
        print(f"Generating speech with {model_choice}...")
-        audio, _, time_report = selected_model.run_model(text, t, top_p, rp, max_tok)
+        audio, _, time_report = selected_model.run_model(text, speaker_id, t, top_p, rp, max_tok)
 
        sample_rate = 22050
        print("Speech generation completed!")
@@ -104,6 +74,20 @@ with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default())
        label="Selected Model",
        info="Base generates random voices"
    )
+    # Speaker selector (shown only if model has speakers)
+    # Pre-populate all available speakers for example table rendering
+    all_speakers = []
+    for _cfg in models_configs.values():
+        if _cfg and _cfg.get('speaker_id'):
+            all_speakers.extend(list(_cfg.speaker_id.keys()))
+    all_speakers = sorted(list(set(all_speakers)))
+    speaker_dropdown = gr.Dropdown(
+        choices=all_speakers,
+        value=None,
+        label="Speaker",
+        visible=False,
+        allow_custom_value=True
+    )
 
    text_input = gr.Textbox(
        label="Text",
@@ -146,30 +130,42 @@ with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default())
        lines=3
    )
 
+    # Update speakers when model changes
+    def update_speakers(model_choice):
+        cfg = models_configs.get(model_choice)
+        speakers = list(cfg.speaker_id.keys()) if (cfg and cfg.get('speaker_id')) else []
+        if speakers:
+            return gr.update(choices=speakers, value=speakers[0], visible=True)
+        else:
+            return gr.update(choices=[], value=None, visible=False)
+
+    model_dropdown.change(
+        fn=update_speakers,
+        inputs=[model_dropdown],
+        outputs=[speaker_dropdown]
+    )
+
+    # Populate speakers on initial page load based on default model
+    demo.load(
+        fn=update_speakers,
+        inputs=[model_dropdown],
+        outputs=[speaker_dropdown]
+    )
+
    # GPU generation event
    generate_btn.click(
        fn=generate_speech_gpu,
-        inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
+        inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
        outputs=[audio_output, time_report_output]
    )
 
    with gr.Row():
 
-        examples = [
-            ["Anyway, um, so, um, tell me, tell me all about her. I mean, what's she like? Is she really, you know, pretty?", "male", 1.4, 0.95, 1.1, 1200],
-            ["No, that does not make you a failure. No, sweetie, no. It just, uh, it just means that you're having a tough time...", "male", 1.4, 0.95, 1.1, 1200],
-            ["I-- Oh, I am such an idiot sometimes. I'm so sorry. Um, I-I don't know where my head's at.", "male", 1.4, 0.95, 1.1, 1200],
-            ["Got it. $300,000. I can definitely help you get a very good price for your property by selecting a realtor.", "female", 1.4, 0.95, 1.1, 1200],
-            ["Holy fu- Oh my God! Don't you understand how dangerous it is?", "male", 1.4, 0.95, 1.1, 1200],
-            ["You make my days brighter, and my wildest dreams feel like reality. How do you do that?", "female", 1.4, 0.95, 1.1, 1200],
-            ["Great, and just a couple quick questions so we can match you with the right buyer. Is your home address still 330 East Charleston Road?", "base", 1.4, 0.95, 1.1, 1200],
-            ["Oh, yeah. I mean did you want to get a quick snack together or maybe something before you go?", "female", 1.4, 0.95, 1.1, 1200],
-        ]
-
+        examples = examples
 
    gr.Examples(
        examples=examples,
-        inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
+        inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
        fn=generate_speech_gpu,
        outputs=[audio_output, time_report_output],
        cache_examples=True,
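The dropdown visibility logic above can be sanity-checked without launching the UI. A minimal sketch: update_speakers_stub is a hypothetical stand-in that drops the closure over models_configs and takes the speaker list directly.

import gradio as gr

def update_speakers_stub(speakers: list[str]):
    # Mirrors update_speakers: show the dropdown and preselect the first
    # speaker when the model has any, hide it otherwise.
    if speakers:
        return gr.update(choices=speakers, value=speakers[0], visible=True)
    return gr.update(choices=[], value=None, visible=False)

print(update_speakers_stub(["David (EN British)", "Puck (EN Gemini)"]))  # visible dropdown
print(update_speakers_stub([]))  # hidden dropdown for the base models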
create_env.py
ADDED

@@ -0,0 +1,21 @@
+import os
+import subprocess
+import sys
+
+def setup_dependencies():
+    os.environ["OMP_NUM_THREADS"] = "4"
+    try:
+        if os.path.exists('/tmp/deps_installed'):
+            return
+
+        print("Installing transformers dev version...")
+        subprocess.check_call([
+            sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-cache-dir",
+            "git+https://github.com/huggingface/transformers.git"
+        ])
+
+        with open('/tmp/deps_installed', 'w') as f:
+            f.write('done')
+
+    except Exception as e:
+        print(f"Dependencies setup error: {e}")
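Usage note: the /tmp/deps_installed sentinel makes the git install of transformers a run-once step per container, so repeated calls within the same container skip the reinstall.

from create_env import setup_dependencies

setup_dependencies()  # first call: pip-installs transformers from git, writes the sentinel
setup_dependencies()  # later calls: return immediately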
examples.yaml
ADDED

@@ -0,0 +1,98 @@
+examples:
+  - text: >-
+      Anyway, um, so, um, tell me, tell me all about her. I mean, what's she like? Is she really, you know, pretty?
+    speaker_id: "Puck (EN Gemini)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      No, that does not make you a failure. No, sweetie, no. It just, uh, it just means that you're having a tough time...
+    speaker_id: "Kore (EN Gemini)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      Holy fu* Oh my God! Don't you understand how dangerous it is, huh?
+    speaker_id: "Andrew (EN)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      Got it. $300,000. I can definitely help you get a very good price for your property by selecting a realtor.
+    speaker_id: "David (EN British)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      I-- Oh, I am such an idiot sometimes. I'm so sorry. Um, I-I don't know where my head's at.
+    speaker_id: "Jenny (EN Irish)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+
+  - text: >-
+      Der Dompfaff ist ein kleiner Fink, der im Winter oft in Gärten zu sehen ist.
+    speaker_id: "Thorsten (DE Hessisch)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      하얀 눈 위의 빨간 점 하나가 아침을 엽서처럼 만든다.
+    speaker_id: "Seulgi (KO)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      这种小雀鸟在冬季常见于树林与花园。
+    speaker_id: "Ming (ZH Shanghai OpenAI)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      طائرٌ صغير يُرى كثيرًا في حدائق الشتاء والغابات.
+    speaker_id: "Karim (AR)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      Great, and just a couple quick questions so we can match you with the right buyer. Is your home address still 330 East Charleston Road?
+    model: "Base Model v.0.2"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      Colleges of Oxford, Cambridge, Durham and the University of the Highlands and Islands UHI are 'listed bodies', as bodies that appear to the Secretary of State to be constituent colleges, schools, halls or other institutions of a university.
+    model: "Base Model v.0.2"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
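These entries are converted into gr.Examples rows by util.Examples (see the util.py diff below). A quick way to inspect the result, as a sketch run from the repo root:

from util import load_config, Examples

rows = Examples(load_config("./examples.yaml"))()
# Row order matches the gr.Examples inputs:
# [text, model, speaker_id, temperature, top_p, repetition_penalty, max_len]
print(rows[0][1], rows[0][2])  # -> KaniTTS Puck (EN Gemini)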
model_config.yaml
ADDED

@@ -0,0 +1,36 @@
+nemo_player:
+  audiocodec_name: nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps
+  tokeniser_length: 64400
+  start_of_text: 1
+  end_of_text: 2
+
+models:
+
+  "KaniTTS":
+    model_name: nineninesix/kani-tts-370m
+    device_map: auto
+    speaker_id:
+      "David (EN British)": david
+      "Puck (EN Gemini)": puck
+      "Kore (EN Gemini)": kore
+      "Andrew (EN)": andrew
+      "Jenny (EN Irish)": jenny
+      "Simon (EN Unstable)": simon
+      "Katie (EN Unstable)": katie
+      "Seulgi (KO)": seulgi
+      "Bert (DE)": bert
+      "Thorsten (DE Hessisch)": thorsten
+      "Maria (ES)": maria
+      "Mei (ZH Cantonese)": mei
+      "Ming (ZH Shanghai OpenAI)": ming
+      "Karim (AR)": karim
+      "Nur (AR)": nur
+
+  "Base Model v.0.2":
+    model_name: nineninesix/kani-tts-450m-0.2-pt
+    device_map: auto
+
+  "Base Model v.0.1":
+    model_name: nineninesix/kani-tts-450m-0.1-pt
+    device_map: auto
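This file holds the display-name to internal-tag mapping consumed by generate_speech_gpu; the base models carry no speaker_id block, which is what hides the speaker dropdown for them. A minimal lookup sketch with OmegaConf:

from omegaconf import OmegaConf

cfg = OmegaConf.load("model_config.yaml")
speaker_map = cfg.models["KaniTTS"].get("speaker_id", {})
print(speaker_map.get("Thorsten (DE Hessisch)"))         # -> thorsten
print(cfg.models["Base Model v.0.2"].get("speaker_id"))  # -> None (random voice)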
requirements.txt
CHANGED

@@ -2,4 +2,5 @@ torch==2.8.0
 librosa==0.11.0
 nemo_toolkit[tts]==2.4.0
 numpy==1.26.4
-gradio>=4.0.0
+gradio>=4.0.0
+omegaconf==2.3.0
util.py
CHANGED

@@ -3,26 +3,66 @@ import librosa
 import requests
 import time
 from nemo.collections.tts.models import AudioCodecModel
-from dataclasses import dataclass
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import os
+from omegaconf import OmegaConf, DictConfig
 
 
-…
+def load_config(config_path: str):
+    """Load configuration from a YAML file using OmegaConf.
+
+    Args:
+        config_path (str): Path to the YAML configuration file.
+
+    Returns:
+        Any: The loaded OmegaConf DictConfig.
+    """
+    resolved_path = os.path.abspath(config_path)
+    if not os.path.exists(resolved_path):
+        raise FileNotFoundError(f"Config file not found: {resolved_path}")
+    config = OmegaConf.load(resolved_path)
+    return config
 
 
 class NemoAudioPlayer:
+
+    """
+    High-level audio reconstruction helper built on NeMo Nano Codec.
+
+    This class converts discrete codec token sequences produced by the
+    language model into time-domain audio waveforms using
+    `nemo.collections.tts.models.AudioCodecModel`. It also optionally
+    handles extraction/decoding of text spans from the generated token
+    stream when a compatible text tokenizer is provided.
+
+    Parameters
+    ----------
+    config : OmegaConf | DictConfig
+        Configuration block under `nemo_player` from `model_config.yaml`.
+        Expected fields:
+        - `audiocodec_name` (str): HuggingFace model id for NeMo codec
+        - `tokeniser_length` (int): Size of the base tokenizer vocabulary
+        - `start_of_text`, `end_of_text` (int): Special text token ids
+    text_tokenizer_name : str, optional
+        HF repo id or local path of the tokenizer used by the LLM. If
+        provided, the player can also extract and decode the text segment
+        embedded in the generated ids for debugging/inspection.
+
+    Notes
+    -----
+    - The class defines a fixed layout of special token ids derived from
+      `tokeniser_length`. Audio codes are expected to be arranged in 4
+      interleaved codebooks (q=4). See `get_nano_codes` for validation.
+    - Device selection is automatic (`cuda` if available else `cpu`).
+
+    Typical Usage
+    -------------
+    1) The model generates a sequence of token ids that contains both text
+       and audio sections delimited by special markers.
+    2) Call `get_waveform(model_output_ids)` to obtain a NumPy waveform
+       ready to be played or saved.
+    """
+
     def __init__(self, config, text_tokenizer_name: str = None) -> None:
         self.conf = config
         print(f"Loading NeMo codec model: {self.conf.audiocodec_name}")
@@ -130,6 +170,41 @@ class NemoAudioPlayer:
 
 
 class KaniModel:
+
+    """
+    Wrapper around a causal LLM that emits NeMo codec tokens for TTS.
+
+    Responsibilities
+    ----------------
+    - Load the LLM and tokenizer from HuggingFace with the provided
+      configuration (model id, device mapping, auth token, and
+      `trust_remote_code`).
+    - Prepare inputs by injecting conversation and modality control tokens
+      expected by the decoder (`START_OF_HUMAN`, `END_OF_TEXT`, etc.), and
+      optionally prefix the input with a speaker id tag.
+    - Perform generation with sampling parameters and return raw token ids.
+    - Delegate waveform reconstruction to `NemoAudioPlayer`.
+
+    Parameters
+    ----------
+    config : OmegaConf | DictConfig
+        Model configuration block from `models[...]` in `model_config.yaml`.
+        Expected fields:
+        - `model_name` (str): HF repo id of the LLM
+        - `device_map` (str | dict): Device mapping strategy for HF
+    player : NemoAudioPlayer
+        Audio decoder that turns generated token ids into waveform.
+    token : str
+        HuggingFace access token (if the model requires authentication).
+
+    Key Methods
+    -----------
+    - `get_input_ids(text, speaker_id)`: builds the prompt with control
+      tokens and returns `(input_ids, attention_mask)` tensors.
+    - `model_request(...)`: runs `generate` with sampling controls.
+    - `run_model(...)`: end-to-end pipeline returning `(audio, text, report)`.
+    """
+
     def __init__(self, config, player: NemoAudioPlayer, token: str) -> None:
         self.conf = config
         self.player = player
@@ -155,14 +230,17 @@ class KaniModel:
 
         print(f"Model loaded successfully on device: {next(self.model.parameters()).device}")
 
-    def get_input_ids(self, text_prompt: str) -> tuple[torch.tensor]:
+    def get_input_ids(self, text_prompt: str, speaker_id: str) -> tuple[torch.tensor]:
         """Prepare input tokens for the model"""
         START_OF_HUMAN = self.player.start_of_human
         END_OF_TEXT = self.player.end_of_text
         END_OF_HUMAN = self.player.end_of_human
 
         # Tokenize input text
-        input_ids = self.tokenizer(text_prompt, return_tensors="pt").input_ids
+        if speaker_id is not None:
+            input_ids = self.tokenizer(f"{speaker_id}: {text_prompt}", return_tensors="pt").input_ids
+        else:
+            input_ids = self.tokenizer(text_prompt, return_tensors="pt").input_ids
 
         # Add special tokens
         start_token = torch.tensor([[START_OF_HUMAN]], dtype=torch.int64)
@@ -207,10 +285,10 @@ class KaniModel:
         report = f"SPEECH TOKENS: {model_request:.2f}\nCODEC: {player_time:.2f}\nTOTAL: {total_time:.2f}"
         return report
 
-    def run_model(self, text: str, t: float, top_p: float, rp: float, max_tok: int):
+    def run_model(self, text: str, speaker_id: str, t: float, top_p: float, rp: float, max_tok: int):
         """Complete pipeline: text -> tokens -> generation -> audio"""
         # Prepare input
-        input_ids, attention_mask = self.get_input_ids(text)
+        input_ids, attention_mask = self.get_input_ids(text, speaker_id)
 
         # Generate tokens
         point_1 = time.time()
@@ -223,3 +301,86 @@ class KaniModel:
         point_3 = time.time()
         return audio, text, self.time_report(point_1, point_2, point_3)
 
+class InitModels:
+
+    """
+    Callable initializer that constructs a map of model name -> KaniModel.
+
+    Parameters
+    ----------
+    models_configs : OmegaConf | DictConfig
+        The `models` section from `model_config.yaml` describing one or
+        more HF LLM checkpoints and their options (device map, speakers).
+    player : NemoAudioPlayer
+        Shared audio decoder instance reused across all models.
+    token_ : str
+        HuggingFace token passed to each `KaniModel` for loading.
+
+    Returns
+    -------
+    dict
+        When called, returns a dictionary `{model_name: KaniModel}`.
+
+    Notes
+    -----
+    - All models are loaded immediately in `__call__` so the UI can list
+      them and switch between them without extra latency.
+    """
+
+    def __init__(self, models_configs: OmegaConf, player: NemoAudioPlayer, token_: str):
+        self.models_configs = models_configs
+        self.player = player
+        self.token_ = token_
+
+    def __call__(self):
+        models = {}
+        for model_name, config in self.models_configs.items():
+            print(f"Loading {model_name}...")
+            models[model_name] = KaniModel(config, self.player, self.token_)
+            print(f"{model_name} loaded!")
+        print("All models loaded!")
+        return models
+
+
+class Examples:
+
+    """
+    Adapter that converts YAML examples into Gradio `gr.Examples` rows.
+
+    Parameters
+    ----------
+    exam_cfg : OmegaConf | DictConfig
+        Parsed contents of `examples.yaml`. Expected structure:
+        `examples: [ {text, speaker_id?, model, temperature?, top_p?,
+        repetition_penalty?, max_len?}, ... ]`.
+
+    Behavior
+    --------
+    - Produces a list-of-lists whose order must match the `inputs` order
+      used when constructing `gr.Examples` in `app.py`.
+    - Current order: `[text, model_dropdown, speaker_dropdown, temp,
+      top_p, rp, max_tok]`.
+
+    Why this exists
+    ---------------
+    - Keeps format and defaults centralized, so changing the UI inputs
+      order only requires a single change here and in `app.py`.
+    """
+
+    def __init__(self, exam_cfg: OmegaConf):
+        self.exam_cfg = exam_cfg
+
+    def __call__(self) -> list[list]:
+        rows = []
+        for e in self.exam_cfg.examples:
+            text = e.get("text")
+            speaker_id = e.get("speaker_id")
+            model = e.get("model")
+            temperature = e.get("temperature", 1.4)
+            top_p = e.get("top_p", 0.95)
+            repetition_penalty = e.get("repetition_penalty", 1.1)
+            max_len = e.get("max_len", 1200)
+            # Order must match gr.Examples inputs: [text, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok]
+            rows.append([text, model, speaker_id, temperature, top_p, repetition_penalty, max_len])
+
+        return rows
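Put together, the new pieces compose as below. A sketch only: it downloads weights from HuggingFace, may need HF_TOKEN for gated repos, and is far faster on GPU.

import os
from util import load_config, NemoAudioPlayer, KaniModel

cfg = load_config("./model_config.yaml")
player = NemoAudioPlayer(cfg.nemo_player)
model = KaniModel(cfg.models["KaniTTS"], player, os.getenv("HF_TOKEN"))

# speaker_id is the internal tag ("david"), not the display name
audio, text, report = model.run_model(
    "Hello there!", "david", t=1.4, top_p=0.95, rp=1.1, max_tok=1200
)
print(report)  # SPEECH TOKENS / CODEC / TOTAL timings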