Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
258fb54
1
Parent(s):
b8368df
(wip)debug
Browse files
- app.py +11 -1
- cosyvoice/cli/cosyvoice.py +0 -18
- cosyvoice/cli/frontend.py +0 -5
app.py
CHANGED
@@ -86,6 +86,16 @@ def get_cosyvoice():
|
|
86 |
load_trt=load_trt)
|
87 |
return cosyvoice_instance
|
88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
@spaces.GPU
|
90 |
def get_asr():
|
91 |
global asr_model
|
@@ -194,7 +204,7 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
|
|
194 |
logging.info('get zero_shot inference request')
|
195 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
196 |
set_all_random_seed(seed)
|
197 |
-
for i in
|
198 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
199 |
elif mode_checkbox_group == 'Cross-lingual Clone':
|
200 |
logging.info('get cross_lingual inference request')
|
|
|
86 |
load_trt=load_trt)
|
87 |
return cosyvoice_instance
|
88 |
|
89 |
+
@spaces.GPU
|
90 |
+
def infer_zeroshot(tts_text, prompt_text, prompt_speech_16k, stream, speed):
|
91 |
+
cosyvoice = get_cosyvoice()
|
92 |
+
if cosyvoice.frontend.instruct is True:
|
93 |
+
logging.warning('CosyVoice2-0.5B does not support zero-shot inference, please use CosyVoice-300M or CosyVoice-300M-Instruct.')
|
94 |
+
return
|
95 |
+
for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
|
96 |
+
yield i
|
97 |
+
|
98 |
+
|
99 |
@spaces.GPU
|
100 |
def get_asr():
|
101 |
global asr_model
|
|
|
204 |
logging.info('get zero_shot inference request')
|
205 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
206 |
set_all_random_seed(seed)
|
207 |
+
for i in infer_zeroshot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
|
208 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
209 |
elif mode_checkbox_group == 'Cross-lingual Clone':
|
210 |
logging.info('get cross_lingual inference request')
|
cosyvoice/cli/cosyvoice.py
CHANGED
@@ -23,7 +23,6 @@ from cosyvoice.utils.file_utils import logging
|
|
23 |
import spaces
|
24 |
|
25 |
class CosyVoice:
|
26 |
-
@spaces.GPU
|
27 |
def __init__(self, model_dir, load_jit=True, load_onnx=False, fp16=True):
|
28 |
instruct = True if '-Instruct' in model_dir else False
|
29 |
self.instruct = instruct
|
@@ -56,18 +55,11 @@ class CosyVoice:
|
|
56 |
if load_onnx:
|
57 |
self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
|
58 |
|
59 |
-
@spaces.GPU
|
60 |
def list_avaliable_spks(self):
|
61 |
spks = list(self.frontend.spk2info.keys())
|
62 |
return spks
|
63 |
|
64 |
-
@spaces.GPU
|
65 |
-
def reload_frontend(self):
|
66 |
-
self.frontend.reload_onnx()
|
67 |
-
|
68 |
-
@spaces.GPU
|
69 |
def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0):
|
70 |
-
self.reload_frontend()
|
71 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
72 |
model_input = self.frontend.frontend_sft(i, spk_id)
|
73 |
start_time = time.time()
|
@@ -78,9 +70,7 @@ class CosyVoice:
|
|
78 |
yield model_output
|
79 |
start_time = time.time()
|
80 |
|
81 |
-
@spaces.GPU
|
82 |
def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
|
83 |
-
self.reload_frontend()
|
84 |
prompt_text = self.frontend.text_normalize(prompt_text, split=False)
|
85 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
86 |
if len(i) < 0.5 * len(prompt_text):
|
@@ -94,7 +84,6 @@ class CosyVoice:
|
|
94 |
yield model_output
|
95 |
start_time = time.time()
|
96 |
|
97 |
-
@spaces.GPU
|
98 |
def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False, speed=1.0):
|
99 |
self.reload_frontend()
|
100 |
if self.frontend.instruct is True:
|
@@ -109,9 +98,7 @@ class CosyVoice:
|
|
109 |
yield model_output
|
110 |
start_time = time.time()
|
111 |
|
112 |
-
@spaces.GPU
|
113 |
def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
|
114 |
-
self.reload_frontend()
|
115 |
assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
|
116 |
if self.frontend.instruct is False:
|
117 |
raise ValueError('{} do not support instruct inference'.format(self.model_dir))
|
@@ -126,9 +113,7 @@ class CosyVoice:
|
|
126 |
yield model_output
|
127 |
start_time = time.time()
|
128 |
|
129 |
-
@spaces.GPU
|
130 |
def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, stream=False, speed=1.0):
|
131 |
-
self.reload_frontend()
|
132 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
133 |
model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate)
|
134 |
start_time = time.time()
|
@@ -139,9 +124,7 @@ class CosyVoice:
|
|
139 |
yield model_output
|
140 |
start_time = time.time()
|
141 |
|
142 |
-
@spaces.GPU
|
143 |
def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
|
144 |
-
self.reload_frontend()
|
145 |
model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k, self.sample_rate)
|
146 |
start_time = time.time()
|
147 |
for model_output in self.model.vc(**model_input, stream=stream, speed=speed):
|
@@ -151,7 +134,6 @@ class CosyVoice:
|
|
151 |
start_time = time.time()
|
152 |
|
153 |
class CosyVoice2(CosyVoice):
|
154 |
-
@spaces.GPU
|
155 |
def __init__(self, model_dir, load_jit=False, load_onnx=False, load_trt=False):
|
156 |
instruct = True if '-Instruct' in model_dir else False
|
157 |
self.instruct = instruct
|
|
|
23 |
import spaces
|
24 |
|
25 |
class CosyVoice:
|
|
|
26 |
def __init__(self, model_dir, load_jit=True, load_onnx=False, fp16=True):
|
27 |
instruct = True if '-Instruct' in model_dir else False
|
28 |
self.instruct = instruct
|
|
|
55 |
if load_onnx:
|
56 |
self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
|
57 |
|
|
|
58 |
def list_avaliable_spks(self):
|
59 |
spks = list(self.frontend.spk2info.keys())
|
60 |
return spks
|
61 |
|
|
|
|
|
|
|
|
|
|
|
62 |
def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0):
|
|
|
63 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
64 |
model_input = self.frontend.frontend_sft(i, spk_id)
|
65 |
start_time = time.time()
|
|
|
70 |
yield model_output
|
71 |
start_time = time.time()
|
72 |
|
|
|
73 |
def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
|
|
|
74 |
prompt_text = self.frontend.text_normalize(prompt_text, split=False)
|
75 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
76 |
if len(i) < 0.5 * len(prompt_text):
|
|
|
84 |
yield model_output
|
85 |
start_time = time.time()
|
86 |
|
|
|
87 |
def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False, speed=1.0):
|
88 |
self.reload_frontend()
|
89 |
if self.frontend.instruct is True:
|
|
|
98 |
yield model_output
|
99 |
start_time = time.time()
|
100 |
|
|
|
101 |
def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
|
|
|
102 |
assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
|
103 |
if self.frontend.instruct is False:
|
104 |
raise ValueError('{} do not support instruct inference'.format(self.model_dir))
|
|
|
113 |
yield model_output
|
114 |
start_time = time.time()
|
115 |
|
|
|
116 |
def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, stream=False, speed=1.0):
|
|
|
117 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
118 |
model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate)
|
119 |
start_time = time.time()
|
|
|
124 |
yield model_output
|
125 |
start_time = time.time()
|
126 |
|
|
|
127 |
def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
|
|
|
128 |
model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k, self.sample_rate)
|
129 |
start_time = time.time()
|
130 |
for model_output in self.model.vc(**model_input, stream=stream, speed=speed):
|
|
|
134 |
start_time = time.time()
|
135 |
|
136 |
class CosyVoice2(CosyVoice):
|
|
|
137 |
def __init__(self, model_dir, load_jit=False, load_onnx=False, load_trt=False):
|
138 |
instruct = True if '-Instruct' in model_dir else False
|
139 |
self.instruct = instruct
|
cosyvoice/cli/frontend.py
CHANGED
@@ -80,11 +80,6 @@ class CosyVoiceFrontEnd:
|
|
80 |
self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
|
81 |
self.en_tn_model = EnNormalizer()
|
82 |
|
83 |
-
def reload_onnx(self):
|
84 |
-
self.campplus_session = onnxruntime.InferenceSession(self.campplus_model, sess_options=self.option, providers=["CPUExecutionProvider"])
|
85 |
-
self.speech_tokenizer_session = onnxruntime.InferenceSession(self.speech_tokenizer_model, sess_options=self.option,
|
86 |
-
providers=["CPUExecutionProvider"])
|
87 |
-
|
88 |
def _extract_text_token(self, text):
|
89 |
text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
|
90 |
text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
|
|
|
80 |
self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
|
81 |
self.en_tn_model = EnNormalizer()
|
82 |
|
|
|
|
|
|
|
|
|
|
|
83 |
def _extract_text_token(self, text):
|
84 |
text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
|
85 |
text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
|