import spaces
import sys
import torch

import gradio as gr
import opencc

# Add the bundled Matcha-TTS third-party package to the import path
sys.path.append('third_party/Matcha-TTS')

from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav

from huggingface_hub import hf_hub_download  # imported but unused in this file

# Simplified-to-Traditional Chinese converter (OpenCC s2t profile)
converter = opencc.OpenCC('s2t.json')
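# e.g. converter.convert("粤语语音合成") -> "粵語語音合成"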

# Preset prompt audio options (None means the user uploads their own file)
PRESET_AUDIO_OPTIONS = {
    "ZoengJyutGaai": "asset/张悦楷.wav",
    "Trump": "asset/特朗普.wav",
    "Taiyi Zhenren": "asset/太乙真人.wav",
    "自定义上传": None
}

# Load the CosyVoice2 Cantonese models
cosyvoice_base = CosyVoice2(
    'ASLP-lab/Cosyvoice2-Yue',
    load_jit=False, load_trt=False, load_vllm=False, fp16=False
)
print('loaded model 1 (CosyVoice2-Yue)')
cosyvoice_zjg = CosyVoice2(
    'ASLP-lab/Cosyvoice2-Yue-ZoengJyutGaai',
    load_jit=False, load_trt=False, load_vllm=False, fp16=False
)
print('loaded model 2 (CosyVoice2-Yue-ZoengJyutGaai)')
# cosyvoice_biaobei = CosyVoice2(
#     'pretrained_models/CosyVoice2-yue-biaobei',
#     load_jit=False, load_trt=False, load_vllm=False, fp16=False
# )
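
# Note: the selected model is moved to the GPU inside tts_inference
# (model.model.cuda()); with the @spaces.GPU decorator the GPU is only
# attached for the duration of that call.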

@spaces.GPU 
def tts_inference(model_choice, text, preset_audio_choice, prompt_audio):
    # Select the model and its prompt audio
    if model_choice == "CosyVoice2-Yue-ZoengJyutGaai":
        model = cosyvoice_zjg
        prompt_audio = "asset/sg_017_090.wav"
    elif model_choice == "CosyVoice2-精品女音":  # "premium female voice"; not exposed in the current dropdown
        model = cosyvoice_base
        prompt_audio = "asset/F01_中立_20054.wav"
    elif model_choice == "CosyVoice2-Yue":
        model = cosyvoice_base
        if preset_audio_choice == "Custom upload":
            # the uploaded file path arrives via the prompt_audio argument
            if prompt_audio is None:
                return None, "Please upload a prompt audio file or choose a preset"
        else:
            prompt_audio = PRESET_AUDIO_OPTIONS[preset_audio_choice]
            if prompt_audio is None:
                return None, "请选择有效的参考音频"
    else:
        return None, "未知模型"
    
    model.model.cuda()

    # Convert simplified input to traditional characters before synthesis
    text = converter.convert(text)
    prompt_speech_16k = load_wav(prompt_audio, 16000)

    # inference_instruct2 yields speech chunks; collect them all (stream=False)
    all_speech = []
    for j in model.inference_instruct2(
        text, "用粤语说这句话", prompt_speech_16k, stream=False  # instruct prompt: "say this sentence in Cantonese"
    ):
        all_speech.append(j['tts_speech'])

    # each chunk has shape (1, num_samples): concatenate along time, then drop the batch dim
    concatenated_speech = torch.cat(all_speech, dim=1)
    audio_numpy = concatenated_speech.squeeze(0).cpu().numpy()
    sample_rate = model.sample_rate

    return (sample_rate, audio_numpy), f"Synthesis succeeded: {text}"
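
# A minimal sketch (not part of the original app) showing how tts_inference could be
# called directly, e.g. to test synthesis without the Gradio UI. Assumes the asset
# paths above exist, a GPU is available, and soundfile is installed; uncomment to try.
#
# import soundfile as sf
# (sr, wav), status = tts_inference("CosyVoice2-Yue", "今天天气很好", "ZoengJyutGaai", None)
# sf.write("output.wav", wav, sr)
# print(status)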


# ---- Gradio Interface ----
demo = gr.Interface(
    fn=tts_inference,
    inputs=[
        gr.Dropdown(
            ["CosyVoice2-Yue", "CosyVoice2-Yue-ZoengJyutGaai"],
            # ["CosyVoice2-base", "CosyVoice2-张悦楷粤语评书", "CosyVoice2-精品女音"],
            label="select model", value="CosyVoice2-Yue"
        ),
        gr.Textbox(lines=2, label="input text"),
        gr.Dropdown(
            choices=list(PRESET_AUDIO_OPTIONS.keys()),
            value="ZoengJyutGaai",
            label="please select audio options(only for CosyVoice2-Yue)"
        ),
        # Older Gradio API: gr.Audio(source="upload", type="filepath", label="upload prompt audio (only for CosyVoice2-Yue)")
        gr.Audio(sources=["upload"], type="filepath", label="upload prompt audio (only for CosyVoice2-Yue, 'Custom upload' option)")
    ],
    outputs=[
        gr.Audio(type="numpy", label="生成的语音"),
        gr.Textbox(label="状态信息")
    ]
)

demo.launch()