# Hugging Face Space application source (Space status when captured: running on Zero).
# File size: 3,942 bytes.
# NOTE: the commit-hash and line-number gutters of the web file viewer that
# preceded the code were page residue, not part of the program.
import spaces
import sys
import torch
import gradio as gr
import opencc
# 添加第三方库路径
sys.path.append('third_party/Matcha-TTS')
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav
from huggingface_hub import hf_hub_download
# Chinese script conversion shared by every request. The 's2t.json' profile
# is OpenCC's Simplified -> Traditional configuration, so input text is
# normalized to Traditional characters before synthesis.
converter = opencc.OpenCC('s2t.json')
# Preset reference (prompt) audio options: display name -> audio file path.
# "Custom Upload" deliberately maps to None; it is the sentinel entry meaning
# "use the file the user uploaded" rather than a bundled asset.
PRESET_AUDIO_OPTIONS = {
    "ZoengJyutGaai": "asset/张悦楷.wav",
    "Trump": "asset/特朗普.wav",
    "Taiyi Zhenren": "asset/太乙真人.wav",
    "Custom Upload": None,
}
# Load the models once at import time so every request reuses them.
# Both models are constructed with the same flags: JIT, TensorRT, vLLM and
# fp16 are all switched off.
_COSYVOICE_LOAD_OPTIONS = {
    "load_jit": False,
    "load_trt": False,
    "load_vllm": False,
    "fp16": False,
}

# Base Cantonese model.
cosyvoice_base = CosyVoice2('ASLP-lab/Cosyvoice2-Yue', **_COSYVOICE_LOAD_OPTIONS)
print('load model 1')

# Variant fine-tuned on the Zoeng Jyut Gaai story-telling data.
cosyvoice_zjg = CosyVoice2(
    'ASLP-lab/Cosyvoice2-Yue-ZoengJyutGaai', **_COSYVOICE_LOAD_OPTIONS
)
print('load model 2')

# Third model, currently disabled:
# cosyvoice_biaobei = CosyVoice2(
# 'pretrained_models/CosyVoice2-yue-biaobei',
# load_jit=False, load_trt=False, load_vllm=False, fp16=False
# )
@spaces.GPU
def tts_inference(model_choice, text, preset_audio_choice, custom_audio):
    """Synthesize speech for ``text`` with the selected model.

    Args:
        model_choice: Name of the model picked in the UI. Recognized values
            are "CosyVoice2-Yue", "CosyVoice2-Yue-ZoengJyutGaai" and
            "CosyVoice2-精品女音".
        text: Text to synthesize. It is passed through the module-level
            OpenCC converter before inference.
        preset_audio_choice: Key of ``PRESET_AUDIO_OPTIONS``. Only consulted
            when ``model_choice`` is "CosyVoice2-Yue".
        custom_audio: Path of the uploaded prompt audio, or ``None``. Only
            used when ``preset_audio_choice`` is "Custom Upload".

    Returns:
        A ``(audio, status)`` pair. On success ``audio`` is
        ``(sample_rate, numpy_array)`` and ``status`` echoes the converted
        text; on any validation failure ``audio`` is ``None`` and ``status``
        explains the problem.
    """
    # Reject blank input before touching the model: there is nothing to
    # synthesize, and an empty result would make torch.cat fail below.
    if not text or not text.strip():
        return None, "请输入要合成的文本"

    # Pick the model and its prompt (reference) audio.
    if model_choice == "CosyVoice2-Yue-ZoengJyutGaai":
        model = cosyvoice_zjg
        prompt_audio = "asset/sg_017_090.wav"
    elif model_choice == "CosyVoice2-精品女音":
        model = cosyvoice_base
        prompt_audio = "asset/F01_中立_20054.wav"
    elif model_choice == "CosyVoice2-Yue":
        model = cosyvoice_base
        if preset_audio_choice == "Custom Upload":
            if custom_audio is None:
                return None, "请上传参考音频或选择预设音频"
            prompt_audio = custom_audio
        else:
            # .get() so a cleared or unknown dropdown value produces the
            # status message instead of an unhandled KeyError.
            prompt_audio = PRESET_AUDIO_OPTIONS.get(preset_audio_choice)
            if prompt_audio is None:
                return None, "请选择有效的参考音频"
    else:
        return None, "未知模型"

    model.model.cuda()

    # Chinese script conversion (converter is configured at module level).
    text = converter.convert(text)
    prompt_speech_16k = load_wav(prompt_audio, 16000)

    # The instruction string asks the model to "say this sentence in
    # Cantonese". Collect every chunk the generator yields.
    all_speech = [
        chunk['tts_speech']
        for chunk in model.inference_instruct2(
            text, "用粤语说这句话", prompt_speech_16k, stream=False
        )
    ]
    # torch.cat raises on an empty list, so report the failure explicitly.
    if not all_speech:
        return None, "未能生成音频，请检查输入文本"

    # Chunks are joined along dim 1 and the leading dim is squeezed away;
    # presumably each chunk is (1, samples) — TODO confirm against CosyVoice2.
    concatenated_speech = torch.cat(all_speech, dim=1)
    audio_numpy = concatenated_speech.squeeze(0).cpu().numpy()
    sample_rate = model.sample_rate
    return (sample_rate, audio_numpy), f"successfully generated:{text}"
# ---- Gradio Interface ----
# Names offered in the model dropdown; the first one is the default.
_MODEL_NAMES = ["CosyVoice2-Yue", "CosyVoice2-Yue-ZoengJyutGaai"]

with gr.Blocks() as demo:
    gr.Markdown("# WenetSpeech-Yue TTS")

    # Short description of the selectable models.
    gr.Markdown("""
- Cosyvoice2-Yue: Finetuned CosyVoice2 with WenetSpeech-Yue.
- Cosyvoice2-Yue-ZoengJyutGaai: Finetuned Cosyvoice2-Yue with The Zoeng Jyut Gaai Story-telling Speech Dataset (https://canclid.github.io/zoengjyutgaai/).
""")

    with gr.Row():
        # Left column: all inputs plus the trigger button.
        with gr.Column(scale=1):
            model_choice = gr.Dropdown(
                _MODEL_NAMES,
                label="Select model",
                value="CosyVoice2-Yue",
            )
            text_input = gr.Textbox(lines=2, label="Input text")
            preset_audio = gr.Dropdown(
                choices=list(PRESET_AUDIO_OPTIONS),
                value="Custom Upload",
                label="Select speaker prompt (CosyVoice2-Yue only)",
            )
            custom_audio = gr.Audio(
                sources=["upload"],
                type="filepath",
                label="Upload prompt audio(CosyVoice2-Yue only)",
            )
            generate_btn = gr.Button("Generate")

        # Right column: synthesized audio and the status message.
        with gr.Column(scale=1):
            output_audio = gr.Audio(type="numpy", label="Generated audio")
            status_text = gr.Textbox(label="Status")

    # Wire the button to the inference function; component order matches the
    # function's parameter order and its (audio, status) return pair.
    _click_inputs = [model_choice, text_input, preset_audio, custom_audio]
    _click_outputs = [output_audio, status_text]
    generate_btn.click(
        fn=tts_inference,
        inputs=_click_inputs,
        outputs=_click_outputs,
    )

demo.launch()
|