ASLP-lab's picture
Update app.py (#3)
d7b5dbc verified
raw
history blame
3.44 kB
import spaces
import sys
import torch
import gradio as gr
import opencc
# Add the bundled third-party library (Matcha-TTS) to the import search path
sys.path.append('third_party/Matcha-TTS')
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav
from huggingface_hub import hf_hub_download
# Chinese script converter; 's2t.json' is OpenCC's Simplified-to-Traditional config.
converter = opencc.OpenCC('s2t.json')
# Preset reference-audio choices: dropdown label -> path of the prompt wav.
# The last entry maps to None because it means "use the uploaded file instead".
PRESET_AUDIO_OPTIONS = {
    "ZoengJyutGaai": "asset/张悦楷.wav",
    "Trump": "asset/特朗普.wav",
    "Taiyi Zhenren": "asset/太乙真人.wav",
    "自定义上传": None,
}
def _load_cosyvoice(model_id):
    """Build a CosyVoice2 model with JIT, TensorRT, vLLM and fp16 all disabled."""
    return CosyVoice2(
        model_id,
        load_jit=False,
        load_trt=False,
        load_vllm=False,
        fp16=False,
    )


# Load the models once at import time; tts_inference picks between them.
cosyvoice_base = _load_cosyvoice('ASLP-lab/Cosyvoice2-Yue')
print('load model 1')
cosyvoice_zjg = _load_cosyvoice('ASLP-lab/Cosyvoice2-Yue-ZoengJyutGaai')
print('load model 2')
# Not loaded: 'pretrained_models/CosyVoice2-yue-biaobei' (a further variant
# that is intentionally left disabled).
@spaces.GPU
def tts_inference(model_choice, text, preset_audio_choice, prompt_audio):
    """Synthesize Cantonese speech for ``text`` with the selected model.

    Args:
        model_choice: Model name chosen in the UI. "CosyVoice2-Yue" lets the
            caller pick the reference audio; the other known names use a
            fixed reference recording.
        text: Text to synthesize. It is passed through the module-level
            OpenCC ``converter`` before inference.
        preset_audio_choice: Key of ``PRESET_AUDIO_OPTIONS``; only consulted
            when ``model_choice`` is "CosyVoice2-Yue".
        prompt_audio: Path of the user-uploaded reference audio, or ``None``.
            Only used for "CosyVoice2-Yue" with the "自定义上传" preset.

    Returns:
        ``((sample_rate, waveform), status)`` on success, where ``waveform``
        is a NumPy array; ``(None, message)`` when the request is invalid or
        the model produced no audio.
    """
    # Reject blank input up front instead of running the model on nothing.
    if not text or not text.strip():
        return None, "请输入要合成的文本"

    # Pick the model and the reference (prompt) audio path.
    if model_choice == "CosyVoice2-Yue-ZoengJyutGaai":
        model = cosyvoice_zjg
        prompt_path = "asset/sg_017_090.wav"
    elif model_choice == "CosyVoice2-精品女音":
        model = cosyvoice_base
        prompt_path = "asset/F01_中立_20054.wav"
    elif model_choice == "CosyVoice2-Yue":
        model = cosyvoice_base
        if preset_audio_choice == "自定义上传":
            # The uploaded file arrives in ``prompt_audio``; the previous code
            # read an undefined ``custom_audio`` name and raised NameError.
            if prompt_audio is None:
                return None, "请上传参考音频或选择预设音频"
            prompt_path = prompt_audio
        else:
            # ``.get`` turns an unknown/cleared preset into a status message
            # rather than a KeyError.
            prompt_path = PRESET_AUDIO_OPTIONS.get(preset_audio_choice)
            if prompt_path is None:
                return None, "请选择有效的参考音频"
    else:
        return None, "未知模型"

    # Move the weights to the GPU inside the GPU-decorated call.
    model.model.cuda()

    # Script conversion via the module-level OpenCC converter.
    text = converter.convert(text)
    prompt_speech_16k = load_wav(prompt_path, 16000)

    # The instruction string asks the model to speak the sentence in Cantonese.
    speech_chunks = [
        result['tts_speech']
        for result in model.inference_instruct2(
            text, "用粤语说这句话", prompt_speech_16k, stream=False
        )
    ]
    if not speech_chunks:
        # torch.cat raises on an empty list; report the failure instead.
        return None, "生成失败:模型未返回音频"

    concatenated_speech = torch.cat(speech_chunks, dim=1)
    audio_numpy = concatenated_speech.squeeze(0).cpu().numpy()
    return (model.sample_rate, audio_numpy), f"生成成功:{text}"
# ---- Gradio Interface ----
def _build_demo():
    """Assemble the Gradio interface that wraps ``tts_inference``.

    The four inputs are created in the same order as the parameters of
    ``tts_inference``; the two outputs match its ``(audio, status)`` return.
    """
    model_selector = gr.Dropdown(
        ["CosyVoice2-Yue", "CosyVoice2-Yue-ZoengJyutGaai"],
        label="select model",
        value="CosyVoice2-Yue",
    )
    text_box = gr.Textbox(lines=2, label="input text")
    preset_selector = gr.Dropdown(
        choices=list(PRESET_AUDIO_OPTIONS),
        value="ZoengJyutGaai",
        label="please select audio options(only for CosyVoice2-Yue)",
    )
    # type="filepath" hands the upload to the callback as a path string.
    upload_widget = gr.Audio(
        sources=["upload"],
        type="filepath",
        label="upload prompt audio(only for CosyVoice2-Yue)",
    )
    speech_output = gr.Audio(type="numpy", label="生成的语音")
    status_output = gr.Textbox(label="状态信息")
    return gr.Interface(
        fn=tts_inference,
        inputs=[model_selector, text_box, preset_selector, upload_widget],
        outputs=[speech_output, status_output],
    )


demo = _build_demo()
demo.launch()