Commit
·
9e032ec
1
Parent(s):
c82a118
(wip)debug
Browse files
models.py
CHANGED
@@ -477,6 +477,13 @@ def insert_initial_models():
|
|
477 |
is_open=True,
|
478 |
model_url="https://github.com/FunAudioLLM/CosyVoice",
|
479 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
480 |
# Model(
|
481 |
# id="papla-p1",
|
482 |
# name="Papla P1",
|
|
|
477 |
is_open=True,
|
478 |
model_url="https://github.com/FunAudioLLM/CosyVoice",
|
479 |
),
|
480 |
+
Model(
|
481 |
+
id="gpt-sovits-v2",
|
482 |
+
name="GPT-SoVITS v2",
|
483 |
+
model_type=ModelType.TTS,
|
484 |
+
is_open=True,
|
485 |
+
model_url="https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2",
|
486 |
+
),
|
487 |
# Model(
|
488 |
# id="papla-p1",
|
489 |
# name="Papla P1",
|
tts.py
CHANGED
@@ -37,10 +37,6 @@ model_mapping = {
|
|
37 |
# "provider": "elevenlabs",
|
38 |
# "model": "eleven_flash_v2_5",
|
39 |
# },
|
40 |
-
# "cartesia-sonic-2": {
|
41 |
-
# "provider": "cartesia",
|
42 |
-
# "model": "sonic-2",
|
43 |
-
# },
|
44 |
"spark-tts": {
|
45 |
"provider": "spark",
|
46 |
"model": "spark-tts",
|
@@ -65,10 +61,6 @@ model_mapping = {
|
|
65 |
# "provider": "hume",
|
66 |
# "model": "octave",
|
67 |
# },
|
68 |
-
# "megatts3": {
|
69 |
-
# "provider": "megatts3",
|
70 |
-
# "model": "megatts3",
|
71 |
-
# },
|
72 |
# "minimax-02-hd": {
|
73 |
# "provider": "minimax",
|
74 |
# "model": "speech-02-hd",
|
@@ -85,14 +77,14 @@ model_mapping = {
|
|
85 |
"provider": "bilibili",
|
86 |
"model": "index-tts",
|
87 |
},
|
88 |
-
"step-audio-tts-3b": {
|
89 |
-
"provider": "swarmeta_ai",
|
90 |
-
"model": "step-audio-tts-3b",
|
91 |
-
},
|
92 |
"maskgct": {
|
93 |
"provider": "amphion",
|
94 |
"model": "maskgct",
|
95 |
},
|
|
|
|
|
|
|
|
|
96 |
}
|
97 |
url = "https://tts-agi-tts-router-v2.hf.space/tts"
|
98 |
headers = {
|
@@ -266,25 +258,9 @@ def predict_cosyvoice_tts(text, reference_audio_path=None):
|
|
266 |
return result
|
267 |
|
268 |
|
269 |
-
def predict_step_audio_tts_3b(text, reference_audio_path=None):
|
270 |
-
from gradio_client import Client, handle_file,file
|
271 |
-
client = Client("https://swarmeta-ai-step-audio-tts-3b.ms.show/")
|
272 |
-
if not reference_audio_path:
|
273 |
-
raise ValueError("step-audio-tts-3b 需要 reference_audio_path")
|
274 |
-
prompt_audio = handle_file(reference_audio_path)
|
275 |
-
result = client.predict(
|
276 |
-
text=text,
|
277 |
-
prompt_audio=file(reference_audio_path),
|
278 |
-
prompt_text="",
|
279 |
-
api_name="/generate_clone"
|
280 |
-
)
|
281 |
-
print("step-audio-tts-3b result:", result)
|
282 |
-
return result
|
283 |
-
|
284 |
-
|
285 |
def predict_maskgct(text, reference_audio_path=None):
|
286 |
from gradio_client import Client, handle_file
|
287 |
-
client = Client("
|
288 |
if not reference_audio_path:
|
289 |
raise ValueError("maskgct 需要 reference_audio_path")
|
290 |
prompt_wav = handle_file(reference_audio_path)
|
@@ -299,6 +275,31 @@ def predict_maskgct(text, reference_audio_path=None):
|
|
299 |
return result
|
300 |
|
301 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
302 |
def predict_tts(text, model, reference_audio_path=None):
|
303 |
global client
|
304 |
print(f"Predicting TTS for {model}")
|
@@ -315,10 +316,10 @@ def predict_tts(text, model, reference_audio_path=None):
|
|
315 |
return predict_spark_tts(text, reference_audio_path)
|
316 |
elif model == "cosyvoice-2.0":
|
317 |
return predict_cosyvoice_tts(text, reference_audio_path)
|
318 |
-
elif model == "step-audio-tts-3b":
|
319 |
-
return predict_step_audio_tts_3b(text, reference_audio_path)
|
320 |
elif model == "maskgct":
|
321 |
return predict_maskgct(text, reference_audio_path)
|
|
|
|
|
322 |
|
323 |
if not model in model_mapping:
|
324 |
raise ValueError(f"Model {model} not found")
|
|
|
37 |
# "provider": "elevenlabs",
|
38 |
# "model": "eleven_flash_v2_5",
|
39 |
# },
|
|
|
|
|
|
|
|
|
40 |
"spark-tts": {
|
41 |
"provider": "spark",
|
42 |
"model": "spark-tts",
|
|
|
61 |
# "provider": "hume",
|
62 |
# "model": "octave",
|
63 |
# },
|
|
|
|
|
|
|
|
|
64 |
# "minimax-02-hd": {
|
65 |
# "provider": "minimax",
|
66 |
# "model": "speech-02-hd",
|
|
|
77 |
"provider": "bilibili",
|
78 |
"model": "index-tts",
|
79 |
},
|
|
|
|
|
|
|
|
|
80 |
"maskgct": {
|
81 |
"provider": "amphion",
|
82 |
"model": "maskgct",
|
83 |
},
|
84 |
+
"gpt-sovits-v2": {
|
85 |
+
"provider": "gpt-sovits",
|
86 |
+
"model": "gpt-sovits-v2",
|
87 |
+
},
|
88 |
}
|
89 |
url = "https://tts-agi-tts-router-v2.hf.space/tts"
|
90 |
headers = {
|
|
|
258 |
return result
|
259 |
|
260 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
def predict_maskgct(text, reference_audio_path=None):
|
262 |
from gradio_client import Client, handle_file
|
263 |
+
client = Client("amphion/maskgct")
|
264 |
if not reference_audio_path:
|
265 |
raise ValueError("maskgct 需要 reference_audio_path")
|
266 |
prompt_wav = handle_file(reference_audio_path)
|
|
|
275 |
return result
|
276 |
|
277 |
|
278 |
+
def predict_gpt_sovits_v2(text, reference_audio_path=None):
    """Synthesize ``text`` through the ``lj1995/GPT-SoVITS-v2`` Gradio Space.

    Both the prompt language and the text language are hard-coded to
    ``"English"``, and the prompt transcript is sent as an empty string.

    Args:
        text: Text to synthesize.
        reference_audio_path: Path of the reference audio, sent to the
            endpoint as ``ref_wav_path``. Required.

    Returns:
        Whatever ``client.predict`` returns for the ``/get_tts_wav`` endpoint.

    Raises:
        ValueError: If ``reference_audio_path`` is missing or empty.
    """
    # Validate first so an invalid call fails before gradio_client is
    # imported or a client for the Space is created.
    if not reference_audio_path:
        raise ValueError("GPT-SoVITS-v2 需要 reference_audio_path")

    # handle_file replaces the deprecated gradio_client.file helper and is
    # what predict_maskgct in this module already uses.
    from gradio_client import Client, handle_file

    client = Client("lj1995/GPT-SoVITS-v2")
    result = client.predict(
        ref_wav_path=handle_file(reference_audio_path),
        # NOTE(review): prompt_text is empty while ref_free=False — confirm
        # the Space accepts a reference clip without its transcript.
        prompt_text="",
        prompt_language="English",
        text=text,
        text_language="English",
        how_to_cut="Slice once every 4 sentences",
        top_k=15,
        top_p=1,
        temperature=1,
        ref_free=False,
        speed=1,
        if_freeze=False,
        inp_refs=[],
        api_name="/get_tts_wav",
    )
    print("gpt-sovits-v2 result:", result)
    return result
|
301 |
+
|
302 |
+
|
303 |
def predict_tts(text, model, reference_audio_path=None):
|
304 |
global client
|
305 |
print(f"Predicting TTS for {model}")
|
|
|
316 |
return predict_spark_tts(text, reference_audio_path)
|
317 |
elif model == "cosyvoice-2.0":
|
318 |
return predict_cosyvoice_tts(text, reference_audio_path)
|
|
|
|
|
319 |
elif model == "maskgct":
|
320 |
return predict_maskgct(text, reference_audio_path)
|
321 |
+
elif model == "gpt-sovits-v2":
|
322 |
+
return predict_gpt_sovits_v2(text, reference_audio_path)
|
323 |
|
324 |
if not model in model_mapping:
|
325 |
raise ValueError(f"Model {model} not found")
|