import spaces
import sys
import torch
import gradio as gr
import opencc

# Add third-party library path
sys.path.append('third_party/Matcha-TTS')
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav
from huggingface_hub import hf_hub_download

# Simplified-to-Traditional Chinese converter
converter = opencc.OpenCC('s2t.json')

# Preset speaker prompt audio options
PRESET_AUDIO_OPTIONS = {
    "ZoengJyutGaai": "asset/张悦楷.wav",
    "Trump": "asset/特朗普.wav",
    "Taiyi Zhenren": "asset/太乙真人.wav",
    "Custom Upload": None
}

# Load models
cosyvoice_base = CosyVoice2(
    'ASLP-lab/Cosyvoice2-Yue',
    load_jit=False, load_trt=False, load_vllm=False, fp16=False
)
print('load model 1')

cosyvoice_zjg = CosyVoice2(
    'ASLP-lab/Cosyvoice2-Yue-ZoengJyutGaai',
    load_jit=False, load_trt=False, load_vllm=False, fp16=False
)
print('load model 2')

# cosyvoice_biaobei = CosyVoice2(
#     'pretrained_models/CosyVoice2-yue-biaobei',
#     load_jit=False, load_trt=False, load_vllm=False, fp16=False
# )


@spaces.GPU
def tts_inference(model_choice, text, preset_audio_choice, custom_audio):
    # Select the model and its default prompt audio
    if model_choice == "CosyVoice2-Yue-ZoengJyutGaai":
        model = cosyvoice_zjg
        prompt_audio = "asset/sg_017_090.wav"
    elif model_choice == "CosyVoice2-精品女音":  # not exposed in the dropdown below
        model = cosyvoice_base
        prompt_audio = "asset/F01_中立_20054.wav"
    elif model_choice == "CosyVoice2-Yue":
        model = cosyvoice_base
        if preset_audio_choice == "Custom Upload":
            if custom_audio is None:
                return None, "Please upload a prompt audio or choose a preset one"
            prompt_audio = custom_audio
        else:
            prompt_audio = PRESET_AUDIO_OPTIONS[preset_audio_choice]
            if prompt_audio is None:
                return None, "Please choose a valid prompt audio"
    else:
        return None, "Unknown model"

    model.model.cuda()

    # Convert Simplified Chinese input to Traditional Chinese
    text = converter.convert(text)
    prompt_speech_16k = load_wav(prompt_audio, 16000)

    all_speech = []
    for _, j in enumerate(
        model.inference_instruct2(
            text,
            "用粤语说这句话",  # instruct prompt: "Say this sentence in Cantonese"
            prompt_speech_16k,
            stream=False
        )
    ):
        all_speech.append(j['tts_speech'])

    concatenated_speech = torch.cat(all_speech, dim=1)
    audio_numpy = concatenated_speech.squeeze(0).cpu().numpy()
    sample_rate = model.sample_rate
    return (sample_rate, audio_numpy), f"Successfully generated: {text}"


# ---- Gradio Interface ----
with gr.Blocks() as demo:
    gr.Markdown("# WenetSpeech-Yue TTS")

    # Model descriptions
    gr.Markdown("""
    - CosyVoice2-Yue: CosyVoice2 fine-tuned on WenetSpeech-Yue.
    - CosyVoice2-Yue-ZoengJyutGaai: CosyVoice2-Yue fine-tuned on the Zoeng Jyut Gaai Story-telling Speech Dataset (https://canclid.github.io/zoengjyutgaai/).
    """)

    with gr.Row():
        with gr.Column(scale=1):
            model_choice = gr.Dropdown(
                ["CosyVoice2-Yue", "CosyVoice2-Yue-ZoengJyutGaai"],
                label="Select model",
                value="CosyVoice2-Yue"
            )
            text_input = gr.Textbox(lines=2, label="Input text")
            preset_audio = gr.Dropdown(
                choices=list(PRESET_AUDIO_OPTIONS.keys()),
                value="Custom Upload",
                label="Select speaker prompt (CosyVoice2-Yue only)"
            )
            custom_audio = gr.Audio(
                sources=["upload"], type="filepath",
                label="Upload prompt audio (CosyVoice2-Yue only)"
            )
            generate_btn = gr.Button("Generate")
        with gr.Column(scale=1):
            output_audio = gr.Audio(type="numpy", label="Generated audio")
            status_text = gr.Textbox(label="Status")

    generate_btn.click(
        fn=tts_inference,
        inputs=[model_choice, text_input, preset_audio, custom_audio],
        outputs=[output_audio, status_text]
    )

demo.launch()