# Spaces: Running on Zero
import spaces | |
import sys | |
import torch | |
import gradio as gr | |
import opencc | |
# 添加第三方库路径 | |
sys.path.append('third_party/Matcha-TTS') | |
from cosyvoice.cli.cosyvoice import CosyVoice2 | |
from cosyvoice.utils.file_utils import load_wav | |
from huggingface_hub import hf_hub_download | |
# OpenCC converter applied to the input text before synthesis.
# It is built from the 's2t.json' configuration.
converter = opencc.OpenCC('s2t.json')

# Preset reference-audio choices offered in the UI, mapped to their asset
# paths. The "自定义上传" (custom upload) entry maps to None because it has
# no bundled file; the audio comes from the user's upload instead.
PRESET_AUDIO_OPTIONS = dict(
    [
        ("ZoengJyutGaai", "asset/张悦楷.wav"),
        ("Trump", "asset/特朗普.wav"),
        ("Taiyi Zhenren", "asset/太乙真人.wav"),
        ("自定义上传", None),
    ]
)
# Load both models once at import time; each one is reused by every request.
# The two checkpoints are loaded with the same options, so they are kept in
# one place.
_MODEL_LOAD_OPTIONS = {
    "load_jit": False,
    "load_trt": False,
    "load_vllm": False,
    "fp16": False,
}

cosyvoice_base = CosyVoice2('ASLP-lab/Cosyvoice2-Yue', **_MODEL_LOAD_OPTIONS)
print('load model 1')

cosyvoice_zjg = CosyVoice2(
    'ASLP-lab/Cosyvoice2-Yue-ZoengJyutGaai', **_MODEL_LOAD_OPTIONS
)
print('load model 2')

# Disabled third checkpoint, kept for reference:
# cosyvoice_biaobei = CosyVoice2(
#     'pretrained_models/CosyVoice2-yue-biaobei',
#     load_jit=False, load_trt=False, load_vllm=False, fp16=False
# )
def _resolve_reference_audio(model_choice, preset_audio_choice, uploaded_audio):
    """Pick the reference (prompt) audio path for the requested model.

    Returns:
        A ``(path, error)`` pair. Exactly one of the two is ``None``: ``path``
        is the audio file to use, ``error`` is a user-facing message
        explaining why no audio could be chosen.
    """
    # These two models always use a fixed, bundled reference recording.
    if model_choice == "CosyVoice2-Yue-ZoengJyutGaai":
        return "asset/sg_017_090.wav", None
    if model_choice == "CosyVoice2-精品女音":
        return "asset/F01_中立_20054.wav", None
    if model_choice != "CosyVoice2-Yue":
        return None, "未知模型"

    # Only "CosyVoice2-Yue" lets the user choose the reference audio.
    if preset_audio_choice == "自定义上传":
        # The uploaded file path arrives through the function argument.
        # The original code read an undefined name here and raised NameError.
        if not uploaded_audio:
            return None, "请上传参考音频或选择预设音频"
        return uploaded_audio, None

    # .get() so a cleared or unknown dropdown value yields the friendly
    # message below instead of a KeyError.
    preset_path = PRESET_AUDIO_OPTIONS.get(preset_audio_choice)
    if preset_path is None:
        return None, "请选择有效的参考音频"
    return preset_path, None


def tts_inference(model_choice, text, preset_audio_choice, prompt_audio):
    """Synthesize speech for ``text`` with the selected model.

    Args:
        model_choice: Model name selected in the UI.
        text: Text to synthesize; it is passed through ``converter`` first.
        preset_audio_choice: Key of ``PRESET_AUDIO_OPTIONS``; only consulted
            for the "CosyVoice2-Yue" model.
        prompt_audio: File path of the user-uploaded reference audio, or
            ``None``. Only used when the custom-upload preset is selected.

    Returns:
        ``((sample_rate, waveform), status_message)`` on success, or
        ``(None, error_message)`` when the inputs are not usable.
    """
    # Validate everything before touching the models, so bad input returns
    # early without doing any GPU work.
    reference_audio, error = _resolve_reference_audio(
        model_choice, preset_audio_choice, prompt_audio
    )
    if error is not None:
        return None, error
    if not text or not text.strip():
        return None, "请输入要合成的文本"

    if model_choice == "CosyVoice2-Yue-ZoengJyutGaai":
        model = cosyvoice_zjg
    else:
        model = cosyvoice_base

    # NOTE(review): `spaces` is imported by this file but no `@spaces.GPU`
    # decorator is visible on this function; confirm whether one is required
    # for this CUDA call on the target hardware.
    model.model.cuda()

    # Script conversion of the input text (see `converter`).
    text = converter.convert(text)
    prompt_speech_16k = load_wav(reference_audio, 16000)

    # Collect every generated chunk, then join them along dim 1.
    speech_chunks = [
        chunk['tts_speech']
        for chunk in model.inference_instruct2(
            text, "用粤语说这句话", prompt_speech_16k, stream=False
        )
    ]
    concatenated_speech = torch.cat(speech_chunks, dim=1)
    audio_numpy = concatenated_speech.squeeze(0).cpu().numpy()
    sample_rate = model.sample_rate
    return (sample_rate, audio_numpy), f"生成成功:{text}"
# ---- Gradio Interface ----
# Input components, in the same order as the parameters of tts_inference:
# model, text, preset reference audio, uploaded reference audio.
_model_selector = gr.Dropdown(
    ["CosyVoice2-Yue", "CosyVoice2-Yue-ZoengJyutGaai"],
    # Earlier set of choices, kept for reference:
    # ["CosyVoice2-base", "CosyVoice2-张悦楷粤语评书", "CosyVoice2-精品女音"],
    label="select model", value="CosyVoice2-Yue"
)
_text_input = gr.Textbox(lines=2, label="input text")
_preset_selector = gr.Dropdown(
    choices=list(PRESET_AUDIO_OPTIONS.keys()),
    value="ZoengJyutGaai",
    label="please select audio options(only for CosyVoice2-Yue)"
)
# Earlier form of the upload component, kept for reference:
# gr.Audio(source="upload", type="filepath", label="上传参考音频(仅 CosyVoice2-Yue 必需)")
_prompt_upload = gr.Audio(
    sources=["upload"],
    type="filepath",
    label="upload prompt audio(only for CosyVoice2-Yue)"
)

# Output components: the generated audio and a status message.
_audio_output = gr.Audio(type="numpy", label="生成的语音")
_status_output = gr.Textbox(label="状态信息")

demo = gr.Interface(
    fn=tts_inference,
    inputs=[_model_selector, _text_input, _preset_selector, _prompt_upload],
    outputs=[_audio_output, _status_output]
)
demo.launch()