Harry Coultas Blum committed
Commit 3d69f83 · 1 Parent(s): 84d326d

trying to cast

Files changed (3)
  1. app.py +5 -5
  2. inference.py +11 -6
  3. vui/inference.py +9 -7
app.py CHANGED
@@ -19,7 +19,8 @@ def get_available_models():
     return models
 
 
-AVAILABLE_MODELS = get_available_models()
+# AVAILABLE_MODELS = get_available_models()
+AVAILABLE_MODELS = {"COHOST": Vui.COHOST}
 print(f"Available models: {list(AVAILABLE_MODELS.keys())}")
 
 current_model = None
@@ -39,7 +40,7 @@ def load_and_warm_model(model_name):
     model = Vui.from_pretrained_inf(model_path).cuda()
 
     print(f"Compiling model {model_name}...")
-    model.decoder = torch.compile(model.decoder, fullgraph=True)
+    # model.decoder = torch.compile(model.decoder, fullgraph=True)
 
     print(f"Warming up model {model_name}...")
     warmup_text = "Hello, this is a test. Let's say some random shizz"
@@ -169,7 +170,7 @@ def load_sample_text(sample_index):
 
 
 # Create Gradio interface
-with gr.Blocks(
+gradio_interface = gr.Blocks(
     title="Vui",
     theme=gr.themes.Soft(),
     head="""
@@ -383,5 +384,4 @@ document.addEventListener('DOMContentLoaded', function() {
         outputs=[text_input, audio_output, info_output],
     )
 
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", share=True)
+demo.launch()
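A note on the compile hunk above: torch.compile with fullgraph=True raises on any graph break, which can keep the Gradio app from booting, and this commit simply comments the call out. A minimal sketch of an opt-in alternative, assuming a hypothetical VUI_COMPILE switch (the env var and the maybe_compile helper are illustrative, not part of this repo):

    import os
    import torch

    def maybe_compile(decoder: torch.nn.Module) -> torch.nn.Module:
        # Hypothetical opt-in toggle; VUI_COMPILE is not a real flag in this repo.
        if os.environ.get("VUI_COMPILE", "0") == "1":
            # fullgraph=True errors out on graph breaks instead of silently
            # falling back to eager execution.
            return torch.compile(decoder, fullgraph=True)
        return decoder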
 
inference.py CHANGED
@@ -1,12 +1,17 @@
 import torchaudio
+import torch
 
 from vui.inference import render
 from vui.model import Vui
 
 model = Vui.from_pretrained().cuda()
-waveform = render(
-    model,
-    "Hey, here is some random stuff, usually something quite long as the shorter the text the less likely the model can cope!",
-)
-print(waveform.shape)
-torchaudio.save("out.opus", waveform[0], 22050)
+model.decoder = torch.compile(model.decoder, fullgraph=True, mode="max-autotune")
+for i in range(10):
+    waveform = render(
+        model,
+        """Hey, here is some random stuff, usually something quite long as the shorter the text the less likely the model can cope!
+So cool yeah makes sense, would you be able to help me with something?
+Sure what is it?""",
+    )
+    print(waveform.shape)
+    torchaudio.save("out.opus", waveform[0].cpu(), 22050)
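The ten-iteration loop here reads like a compile warm-up benchmark: the first render call pays the mode="max-autotune" compilation cost, and later calls run at steady-state speed; torchaudio.save also expects a CPU tensor, which is presumably what the waveform[0].cpu() cast (and the commit message) is about. A sketch of per-call timing under those assumptions (the time_iterations helper is mine, not repo code):

    import time
    import torch

    def time_iterations(fn, n: int = 10) -> list[float]:
        # First call is slow (torch.compile autotuning); the rest show steady state.
        times = []
        for _ in range(n):
            torch.cuda.synchronize()  # flush queued GPU work before starting the clock
            t0 = time.perf_counter()
            fn()
            torch.cuda.synchronize()  # wait for the GPU to finish before stopping it
            times.append(time.perf_counter() - t0)
        return times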
vui/inference.py CHANGED
@@ -83,7 +83,7 @@ def replace_numbers_with_words(text):
     return re.sub(r"\d+", number_to_words, text)
 
 
-valid_non_speech = ["breath", "sigh", "laugh", "tut", "hesitate"]
+valid_non_speech = ["breath", "sigh", "laugh", "tut", "hesitate", "clearthroat"]
 valid_non_speech = [f"[{v}]" for v in valid_non_speech]
 
 
@@ -316,24 +316,26 @@ def render(
     Render audio from text. Uses generate for text < 1000 characters,
     otherwise breaks text into sections and uses chunking with context.
     """
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+
     text = remove_all_invalid_non_speech(text)
     text = simple_clean(text)
     SR = self.codec.config.sample_rate
     HZ = self.codec.hz
     max_gen_len = int(HZ * max_secs)
+    t1 = time.perf_counter()
 
-    if len(text) < 1000:
+    if len(text) < 1400:
         codes = generate(
             self, text, prompt_codes, temperature, top_k, top_p, max_gen_len
         )
         codes = codes[..., :-10]
         audio = self.codec.from_indices(codes)
-        return audio
-
-    raise Exception("Failed to render")
-
+        print("RTF", (audio.numel()/SR)/(time.perf_counter() - t1))
+        return audio.cpu()
+
     # Otherwise we have to do some clever chaining!
-
     orig_codes = prompt_codes
 
     lines = text.split("\n")
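The hunk also enables TF32 matmuls, trading a little float32 precision for tensor-core speed on Ampere-class GPUs, and the new RTF print reports real-time factor as seconds of audio produced per second of wall clock, so values above 1.0 mean faster-than-real-time synthesis (some of the literature defines RTF as the inverse ratio). The same arithmetic in isolation, with a hypothetical helper name:

    def real_time_factor(num_samples: int, sample_rate: int, elapsed_s: float) -> float:
        # Seconds of audio generated per wall-clock second; > 1.0 beats real time.
        return (num_samples / sample_rate) / elapsed_s

    # e.g. 10 s of 22050 Hz audio rendered in 4 s of compute -> RTF of 2.5
    assert real_time_factor(220500, 22050, 4.0) == 2.5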