import spaces
import gradio as gr
import torch
import soundfile as sf
from transformers import AutoTokenizer, AutoModelForCausalLM
from xcodec2.modeling_xcodec2 import XCodec2Model
import tempfile
import json

device = "cuda" if torch.cuda.is_available() else "cpu"

####################
#  Load models globally
####################
llasa_model_id = "HKUSTAudio/Llasa-1B-multi-speakers-genshin-zh-en-ja-ko"
print("Loading tokenizer & model ...")
tokenizer = AutoTokenizer.from_pretrained(llasa_model_id)
model = AutoModelForCausalLM.from_pretrained(llasa_model_id)
model.eval().to(device)

print("Loading XCodec2Model ...")
codec_model_path = "HKUSTAudio/xcodec2"
Codec_model = XCodec2Model.from_pretrained(codec_model_path)
Codec_model.eval().to(device)

print("Models loaded.")

with open("Reference_Voice/text.json", "r", encoding="utf-8") as f:
    prompt_text_dict = json.load(f)
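# text.json maps game -> speaker -> transcript of that speaker's reference clip,
# e.g. {"Genshin": {"Furina": "<reference transcript>"}}; this layout is inferred
# from the prompt_text_dict[game][speaker] lookups below.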

####################
#  Inference helpers
####################
def extract_speech_ids(speech_tokens_str):
    """
    将类似 <|s_23456|> 还原为 int 23456
    """
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith("<|s_") and token_str.endswith("|>"):
            num_str = token_str[4:-2]
            num = int(num_str)
            speech_ids.append(num)
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids

def ids_to_speech_tokens(speech_ids):
    """
    Convert int ids like 23456 to tokens of the form <|s_23456|>.
    """
    speech_tokens_str = []
    for speech_id in speech_ids:
        speech_tokens_str.append(f"<|s_{speech_id}|>")
    return speech_tokens_str
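
# Round trip example:
#   ids_to_speech_tokens([23456])         -> ["<|s_23456|>"]
#   extract_speech_ids(["<|s_23456|>"])   -> [23456]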

@spaces.GPU
def text2speech(target_text, game, speaker):
    """
    Convert text to a speech waveform and return the path of the generated audio file.
    """

    # Load the reference clip for the selected speaker and downmix stereo to mono
    prompt_wav, sr = sf.read(f"Reference_Voice/{game}/{speaker}/audio.mp3")
    prompt_wav = torch.from_numpy(prompt_wav).float()
    if prompt_wav.ndim == 2:  # (samples, channels) -> (samples,)
        prompt_wav = prompt_wav.mean(dim=1)
    prompt_wav = prompt_wav.unsqueeze(0)  # add batch dim: (1, samples)

    # Prepend the reference transcript so the model clones the reference voice
    prompt_text = prompt_text_dict[game][speaker]
    input_text = prompt_text + " " + target_text

    with torch.no_grad():

        # Encode the prompt wav into discrete codec tokens
        vq_code_prompt = Codec_model.encode_code(input_waveform=prompt_wav)
        print("Prompt VQ code shape:", vq_code_prompt.shape)

        vq_code_prompt = vq_code_prompt[0, 0, :]
        # Convert int ids like 12345 to tokens like <|s_12345|>
        speech_ids_prefix = ids_to_speech_tokens(vq_code_prompt)

        # Wrap the input text with the text-understanding markers
        formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"

        # Tokenize the text and the speech prefix
        chat = [
            {"role": "user", "content": "Convert the text to speech:" + formatted_text},
            {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>" + ''.join(speech_ids_prefix)}
        ]
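
        # continue_final_message=True below resumes generation inside this
        # assistant turn, so the model extends the speech-token prefix instead
        # of starting a fresh reply.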

        input_ids = tokenizer.apply_chat_template(
            chat,
            tokenize=True,
            return_tensors='pt',
            continue_final_message=True
        )
        input_ids = input_ids.to(device)
        speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

        # Generate the speech autoregressively
        outputs = model.generate(
            input_ids,
            max_length=2048,  # the model was trained with a max length of 2048
            eos_token_id=speech_end_id,  # stop at <|SPEECH_GENERATION_END|>
            do_sample=True,
            top_p=1,
            temperature=0.8,
        )
        # Extract the generated tokens. The slice starts len(speech_ids_prefix)
        # tokens before the end of the prompt, so the decoded audio includes the
        # reference prefix; the final <|SPEECH_GENERATION_END|> token is dropped.
        generated_ids = outputs[0][input_ids.shape[1] - len(speech_ids_prefix):-1]

        speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # Convert tokens like <|s_23456|> back to ints
        speech_tokens = extract_speech_ids(speech_tokens)

        # Move the speech tokens to the same device as the codec model
        speech_tokens = torch.tensor(speech_tokens, device=device).unsqueeze(0).unsqueeze(0)

        # Decode the speech tokens back to a waveform of shape (1, 1, samples)
        gen_wav = Codec_model.decode_code(speech_tokens)

    # Pull out the mono waveform; XCodec2 operates at 16 kHz
    audio = gen_wav[0, 0, :].cpu().numpy()
    sample_rate = 16000

    # Write the audio to a temporary .wav file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
        sf.write(tmpfile.name, audio, sample_rate)
        audio_path = tmpfile.name

    return audio_path
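
# Minimal direct-call sketch (assumes the reference assets for Genshin/Furina
# exist under Reference_Voice/, as wired up in the UI below):
#   wav_path = text2speech("Hello, traveler!", "Genshin", "Furina")
#   print(wav_path)  # path to a temporary .wav file with the generated speech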

####################
#  Gradio UI
####################
game_choices = [
    "HonkaiSR",
    "Zenless",
    "Genshin"
]

speaker_game_dict = {
    "HonkaiSR": [
        "Kafka", "Firefly", "Silverwolf"
    ],
    "Zenless": [
        "Yixuan", "Miyabi", "Jane"
    ],
    "Genshin": [
        "Mavuika", "Navia", "Kokomi", "Furina", "Yoimiya"
    ]
}
#["puck", "kore"]

if __name__ == "__main__":
    with gr.Blocks() as demo:
        gr.Markdown("## Text to Speech Generation")
        with gr.Row():
            game = gr.Dropdown(label="Game", choices=game_choices, value="HonkaiSR")
            speaker = gr.Dropdown(label="Speaker", choices=speaker_game_dict[game.value], value=speaker_game_dict[game.value][0])
        
        target_text = gr.Textbox(label="Target Text", placeholder="Enter the text you want to convert to speech.")
        output_audio = gr.Audio(label="Generated Audio", type="filepath")

        def update_speakers(game):
            # Swap the speaker choices when the game changes and reset to the first one
            return gr.update(choices=speaker_game_dict[game], value=speaker_game_dict[game][0])

        game.change(update_speakers, inputs=game, outputs=speaker)

        text2speech_button = gr.Button("Generate Speech")
        text2speech_button.click(text2speech, inputs=[target_text, game, speaker], outputs=output_audio)

    demo.launch()