kemuriririn committed
Commit 258fb54 · Parent: b8368df

(wip)debug

Move @spaces.GPU off the CosyVoice methods and onto an app-level infer_zeroshot wrapper; drop the reload_frontend/reload_onnx helpers.

Files changed (3):
  1. app.py +11 -1
  2. cosyvoice/cli/cosyvoice.py +0 -18
  3. cosyvoice/cli/frontend.py +0 -5
app.py CHANGED
@@ -86,6 +86,16 @@ def get_cosyvoice():
                                         load_trt=load_trt)
     return cosyvoice_instance
 
+@spaces.GPU
+def infer_zeroshot(tts_text, prompt_text, prompt_speech_16k, stream, speed):
+    cosyvoice = get_cosyvoice()
+    if cosyvoice.frontend.instruct is True:
+        logging.warning('CosyVoice2-0.5B does not support zero-shot inference, please use CosyVoice-300M or CosyVoice-300M-Instruct.')
+        return
+    for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
+        yield i
+
+
 @spaces.GPU
 def get_asr():
     global asr_model
@@ -194,7 +204,7 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
         logging.info('get zero_shot inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
-        for i in get_cosyvoice().inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
+        for i in infer_zeroshot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
            yield (target_sr, i['tts_speech'].numpy().flatten())
     elif mode_checkbox_group == 'Cross-lingual Clone':
         logging.info('get cross_lingual inference request')
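
The new infer_zeroshot follows the usual ZeroGPU pattern: the CosyVoice instance stays undecorated, and @spaces.GPU sits only on the top-level generator that Gradio invokes, so a GPU is requested per call rather than at model-construction time. A minimal sketch of the pattern, assuming a ZeroGPU Space (Synthesizer and its inference method are hypothetical stand-ins, not part of this repo):

    import spaces

    class Synthesizer:                          # hypothetical stand-in for CosyVoice
        def inference(self, text):
            yield {'tts_speech': text}          # placeholder for streamed audio chunks

    synth = Synthesizer()                       # built once, with no GPU decorator

    @spaces.GPU                                 # GPU attached only while this generator runs
    def infer(text):
        for chunk in synth.inference(text):     # delegate to the plain library method
            yield chunk                         # stream chunks back to the Gradio caller

Because spaces.GPU accepts generator functions, the streamed tts_speech chunks pass through the wrapper unchanged.
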
cosyvoice/cli/cosyvoice.py CHANGED
@@ -23,7 +23,6 @@ from cosyvoice.utils.file_utils import logging
 import spaces
 
 class CosyVoice:
-    @spaces.GPU
     def __init__(self, model_dir, load_jit=True, load_onnx=False, fp16=True):
         instruct = True if '-Instruct' in model_dir else False
         self.instruct = instruct
@@ -56,18 +55,11 @@ class CosyVoice:
         if load_onnx:
             self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
 
-    @spaces.GPU
     def list_avaliable_spks(self):
         spks = list(self.frontend.spk2info.keys())
         return spks
 
-    @spaces.GPU
-    def reload_frontend(self):
-        self.frontend.reload_onnx()
-
-    @spaces.GPU
     def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0):
-        self.reload_frontend()
         for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
             model_input = self.frontend.frontend_sft(i, spk_id)
             start_time = time.time()
@@ -78,9 +70,7 @@ class CosyVoice:
             yield model_output
             start_time = time.time()
 
-    @spaces.GPU
     def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
-        self.reload_frontend()
         prompt_text = self.frontend.text_normalize(prompt_text, split=False)
         for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
             if len(i) < 0.5 * len(prompt_text):
@@ -94,7 +84,6 @@ class CosyVoice:
             yield model_output
             start_time = time.time()
 
-    @spaces.GPU
     def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False, speed=1.0):
         self.reload_frontend()
         if self.frontend.instruct is True:
@@ -109,9 +98,7 @@ class CosyVoice:
             yield model_output
             start_time = time.time()
 
-    @spaces.GPU
     def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
-        self.reload_frontend()
         assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
         if self.frontend.instruct is False:
             raise ValueError('{} do not support instruct inference'.format(self.model_dir))
@@ -126,9 +113,7 @@ class CosyVoice:
             yield model_output
             start_time = time.time()
 
-    @spaces.GPU
     def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, stream=False, speed=1.0):
-        self.reload_frontend()
         for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
             model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate)
             start_time = time.time()
@@ -139,9 +124,7 @@ class CosyVoice:
             yield model_output
             start_time = time.time()
 
-    @spaces.GPU
     def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
-        self.reload_frontend()
         model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k, self.sample_rate)
         start_time = time.time()
         for model_output in self.model.vc(**model_input, stream=stream, speed=speed):
@@ -151,7 +134,6 @@ class CosyVoice:
             start_time = time.time()
 
 class CosyVoice2(CosyVoice):
-    @spaces.GPU
     def __init__(self, model_dir, load_jit=False, load_onnx=False, load_trt=False):
         instruct = True if '-Instruct' in model_dir else False
         self.instruct = instruct
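
Only the zero-shot path gains a wrapper in this commit; the other app.py modes appear to still call the now-undecorated methods directly. If they later get the same treatment, the wrappers would look analogous, e.g. a hypothetical cross-lingual one (not part of this diff):

    @spaces.GPU
    def infer_cross_lingual(tts_text, prompt_speech_16k, stream, speed):
        # hypothetical follow-up wrapper, mirroring infer_zeroshot above
        for i in get_cosyvoice().inference_cross_lingual(tts_text, prompt_speech_16k,
                                                         stream=stream, speed=speed):
            yield i
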
cosyvoice/cli/frontend.py CHANGED
@@ -80,11 +80,6 @@ class CosyVoiceFrontEnd:
         self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
         self.en_tn_model = EnNormalizer()
 
-    def reload_onnx(self):
-        self.campplus_session = onnxruntime.InferenceSession(self.campplus_model, sess_options=self.option, providers=["CPUExecutionProvider"])
-        self.speech_tokenizer_session = onnxruntime.InferenceSession(self.speech_tokenizer_model, sess_options=self.option,
-                                                                     providers=["CPUExecutionProvider"])
-
     def _extract_text_token(self, text):
         text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
         text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
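
The deleted reload_onnx merely rebuilt the two CPU onnxruntime sessions that the frontend presumably already constructs in __init__. For reference, a session of that shape is created roughly as follows (the model path and option values are illustrative assumptions mirroring the removed lines):

    import onnxruntime

    option = onnxruntime.SessionOptions()
    option.intra_op_num_threads = 1         # assumed; the diff only shows sess_options=self.option
    session = onnxruntime.InferenceSession('campplus.onnx',  # assumed model path
                                           sess_options=option,
                                           providers=['CPUExecutionProvider'])
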