Spaces: Running on Zero
Harry Coultas Blum committed · Commit 3d69f83 · 1 Parent(s): 84d326d

trying to cast
Files changed:
- app.py +5 -5
- inference.py +11 -6
- vui/inference.py +9 -7
app.py

```diff
@@ -19,7 +19,8 @@ def get_available_models():
     return models
 
 
-AVAILABLE_MODELS = get_available_models()
+# AVAILABLE_MODELS = get_available_models()
+AVAILABLE_MODELS = {"COHOST": Vui.COHOST}
 print(f"Available models: {list(AVAILABLE_MODELS.keys())}")
 
 current_model = None
@@ -39,7 +40,7 @@ def load_and_warm_model(model_name):
     model = Vui.from_pretrained_inf(model_path).cuda()
 
     print(f"Compiling model {model_name}...")
-    model.decoder = torch.compile(model.decoder, fullgraph=True)
+    # model.decoder = torch.compile(model.decoder, fullgraph=True)
 
     print(f"Warming up model {model_name}...")
     warmup_text = "Hello, this is a test. Let's say some random shizz"
@@ -169,7 +170,7 @@ def load_sample_text(sample_index):
 
 
 # Create Gradio interface
-
+gradio_interface = gr.Blocks(
     title="Vui",
     theme=gr.themes.Soft(),
     head="""
@@ -383,5 +384,4 @@ document.addEventListener('DOMContentLoaded', function() {
     outputs=[text_input, audio_output, info_output],
 )
 
-
-demo.launch(server_name="0.0.0.0", share=True)
+demo.launch()
```
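The compile line is commented out rather than deleted, presumably because `torch.compile` warm-up is slow on a ZeroGPU Space. A hedged sketch of an alternative that keeps compilation available behind an opt-in flag (the `VUI_COMPILE` variable and `maybe_compile` helper are illustrative names, not part of this repo):

```python
# Hypothetical sketch: gate the slow torch.compile step behind an env flag
# so the Space starts quickly by default but can still opt into compilation.
import os

import torch


def maybe_compile(decoder):
    if os.environ.get("VUI_COMPILE", "0") == "1":
        return torch.compile(decoder, fullgraph=True)
    return decoder
```

With this, `load_and_warm_model` could run `model.decoder = maybe_compile(model.decoder)` instead of hard-commenting the line.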
inference.py

```diff
@@ -1,12 +1,17 @@
 import torchaudio
+import torch
 
 from vui.inference import render
 from vui.model import Vui
 
 model = Vui.from_pretrained().cuda()
-
-
-
-
-
-
+model.decoder = torch.compile(model.decoder, fullgraph=True, mode="max-autotune")
+for i in range(10):
+    waveform = render(
+        model,
+        """Hey, here is some random stuff, usually something quite long as the shorter the text the less likely the model can cope!
+So cool yeah makes sense, would you be able to help me with something?
+Sure what is it?""",
+    )
+    print(waveform.shape)
+torchaudio.save("out.opus", waveform[0].cpu(), 22050)
```
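The ten-iteration loop reads as a warm-up benchmark: with `mode="max-autotune"`, the first call after `torch.compile` pays the compilation and autotuning cost, and steady-state latency only appears after a few runs. A minimal, self-contained sketch of that effect (a toy `nn.Linear` stands in for the Vui decoder; this is an assumption about intent, not code from the commit):

```python
# Toy sketch: time repeated calls to a compiled module. The first call
# includes compile/autotune time; later calls show steady-state speed.
import time

import torch

net = torch.compile(torch.nn.Linear(512, 512).cuda(), mode="max-autotune")
x = torch.randn(8, 512, device="cuda")

for i in range(10):
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    with torch.no_grad():
        net(x)
    torch.cuda.synchronize()
    print(f"call {i}: {time.perf_counter() - t0:.3f}s")
```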
vui/inference.py

```diff
@@ -83,7 +83,7 @@ def replace_numbers_with_words(text):
     return re.sub(r"\d+", number_to_words, text)
 
 
-valid_non_speech = ["breath", "sigh", "laugh", "tut", "hesitate"]
+valid_non_speech = ["breath", "sigh", "laugh", "tut", "hesitate", "clearthroat"]
 valid_non_speech = [f"[{v}]" for v in valid_non_speech]
 
 
@@ -316,24 +316,26 @@ def render(
     Render audio from text. Uses generate for text < 1000 characters,
     otherwise breaks text into sections and uses chunking with context.
     """
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+
     text = remove_all_invalid_non_speech(text)
     text = simple_clean(text)
     SR = self.codec.config.sample_rate
     HZ = self.codec.hz
     max_gen_len = int(HZ * max_secs)
+    t1 = time.perf_counter()
 
-    if len(text) < 1000:
+    if len(text) < 1400:
         codes = generate(
             self, text, prompt_codes, temperature, top_k, top_p, max_gen_len
         )
         codes = codes[..., :-10]
         audio = self.codec.from_indices(codes)
-
-
-
-
+        print("RTF", (audio.numel()/SR)/(time.perf_counter() - t1))
+        return audio.cpu()
+
     # Otherwise we have to do some clever chaining!
-
     orig_codes = prompt_codes
 
     lines = text.split("\n")
```
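Two notes on the `render` changes. The printed "RTF" is the real-time factor: `audio.numel()/SR` is the clip length in seconds, divided by the wall-clock time since `t1`, so RTF > 1 means audio is synthesized faster than real time. The TF32 toggles let float32 matmuls and cuDNN convolutions run on tensor cores at reduced mantissa precision on Ampere-and-newer GPUs; a short sketch of the same settings, plus the newer one-line equivalent for the matmul side (assuming PyTorch ≥ 1.12):

```python
# Sketch of the TF32 settings enabled in render(), with the newer API that
# covers the matmul half. Both trade a little float32 precision for speed.
import torch

torch.backends.cuda.matmul.allow_tf32 = True  # TF32 for CUDA matmuls
torch.backends.cudnn.allow_tf32 = True        # TF32 for cuDNN convolutions
torch.set_float32_matmul_precision("high")    # equivalent matmul setting
```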