import spaces
import gradio as gr
import torch
import soundfile as sf
from transformers import AutoTokenizer, AutoModelForCausalLM
from xcodec2.modeling_xcodec2 import XCodec2Model
import tempfile
import json

device = "cuda" if torch.cuda.is_available() else "cpu"

####################
#  Load models globally
####################
llasa_model_id = "HKUSTAudio/Llasa-1B-multi-speakers-genshin-zh-en-ja-ko"
print("Loading tokenizer & model ...")
tokenizer = AutoTokenizer.from_pretrained(llasa_model_id)
model = AutoModelForCausalLM.from_pretrained(llasa_model_id)
model.eval().to(device)

print("Loading XCodec2Model ...")
codec_model_path = "HKUSTAudio/xcodec2"
Codec_model = XCodec2Model.from_pretrained(codec_model_path)
Codec_model.eval().to(device)

print("Models loaded.")

with open("Reference_Voice/text.json", "r", encoding="utf-8") as f:
    prompt_text_dict = json.load(f)
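# text.json maps game -> speaker -> transcript of that speaker's reference clip,
# e.g. {"Genshin": {"Furina": "<reference transcript>"}}; this layout is inferred
# from the prompt_text_dict[game][speaker] lookups below.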

####################
#  Inference helpers
####################
def extract_speech_ids(speech_tokens_str):
    """
    将类似 <|s_23456|> 还原为 int 23456
    """
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith("<|s_") and token_str.endswith("|>"):
            num_str = token_str[4:-2]
            num = int(num_str)
            speech_ids.append(num)
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids

def ids_to_speech_tokens(speech_ids):
    """
    Convert int ids like 23456 to tokens of the form <|s_23456|>.
    """
    speech_tokens_str = []
    for speech_id in speech_ids:
        speech_tokens_str.append(f"<|s_{speech_id}|>")
    return speech_tokens_str
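
# Round trip example:
#   ids_to_speech_tokens([23456])         -> ["<|s_23456|>"]
#   extract_speech_ids(["<|s_23456|>"])   -> [23456]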

@spaces.GPU
def text2speech(target_text, game, speaker):
    """
    Convert text to a speech waveform and return the path of the generated audio file.
    """

    # Load the reference clip for the selected speaker and downmix stereo to mono
    prompt_wav, sr = sf.read(f"Reference_Voice/{game}/{speaker}/audio.mp3")
    prompt_wav = torch.from_numpy(prompt_wav).float()
    if prompt_wav.ndim == 2:  # (samples, channels) -> (samples,)
        prompt_wav = prompt_wav.mean(dim=1)
    prompt_wav = prompt_wav.unsqueeze(0)  # add batch dim: (1, samples)

    # Prepend the reference transcript so the model clones the reference voice
    prompt_text = prompt_text_dict[game][speaker]
    input_text = prompt_text + " " + target_text

    with torch.no_grad():

        # Encode the prompt wav into discrete codec tokens
        vq_code_prompt = Codec_model.encode_code(input_waveform=prompt_wav)
        print("Prompt VQ code shape:", vq_code_prompt.shape)

        vq_code_prompt = vq_code_prompt[0, 0, :]
        # Convert int ids like 12345 to tokens like <|s_12345|>
        speech_ids_prefix = ids_to_speech_tokens(vq_code_prompt)

        # Wrap the input text with the text-understanding markers
        formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"

        # Tokenize the text and the speech prefix
        chat = [
            {"role": "user", "content": "Convert the text to speech:" + formatted_text},
            {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>" + ''.join(speech_ids_prefix)}
        ]
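
        # continue_final_message=True below resumes generation inside this
        # assistant turn, so the model extends the speech-token prefix instead
        # of starting a fresh reply.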

        input_ids = tokenizer.apply_chat_template(
            chat,
            tokenize=True,
            return_tensors='pt',
            continue_final_message=True
        )
        input_ids = input_ids.to(device)
        speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

        # Generate the speech autoregressively
        outputs = model.generate(
            input_ids,
            max_length=2048,  # the model was trained with a max length of 2048
            eos_token_id=speech_end_id,  # stop at <|SPEECH_GENERATION_END|>
            do_sample=True,
            top_p=1,
            temperature=0.8,
        )
        # Extract the generated tokens. The slice starts len(speech_ids_prefix)
        # tokens before the end of the prompt, so the decoded audio includes the
        # reference prefix; the final <|SPEECH_GENERATION_END|> token is dropped.
        generated_ids = outputs[0][input_ids.shape[1] - len(speech_ids_prefix):-1]

        speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # Convert tokens like <|s_23456|> back to ints
        speech_tokens = extract_speech_ids(speech_tokens)

        # Move the speech tokens to the same device as the codec model
        speech_tokens = torch.tensor(speech_tokens, device=device).unsqueeze(0).unsqueeze(0)

        # Decode the speech tokens back to a waveform of shape (1, 1, samples)
        gen_wav = Codec_model.decode_code(speech_tokens)

    # Pull out the mono waveform; XCodec2 operates at 16 kHz
    audio = gen_wav[0, 0, :].cpu().numpy()
    sample_rate = 16000

    # Write the audio to a temporary .wav file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
        sf.write(tmpfile.name, audio, sample_rate)
        audio_path = tmpfile.name

    return audio_path
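
# Minimal direct-call sketch (assumes the reference assets for Genshin/Furina
# exist under Reference_Voice/, as wired up in the UI below):
#   wav_path = text2speech("Hello, traveler!", "Genshin", "Furina")
#   print(wav_path)  # path to a temporary .wav file with the generated speech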

####################
#  Gradio UI
####################
game_choices = [
    "HonkaiSR",
    "Zenless",
    "Genshin"
]

speaker_game_dict = {
    "HonkaiSR": [
        "Kafka", "Firefly", "Silverwolf"
    ],
    "Zenless": [
        "Yixuan", "Miyabi", "Jane"
    ],
    "Genshin": [
        "Mavuika", "Navia", "Kokomi", "Furina", "Yoimiya"
    ]
}
#["puck", "kore"]

if __name__ == "__main__":
    with gr.Blocks() as demo:
        gr.Markdown("## Text to Speech Generation")
        with gr.Row():
            game = gr.Dropdown(label="Game", choices=game_choices, value="HonkaiSR")
            speaker = gr.Dropdown(label="Speaker", choices=speaker_game_dict[game.value], value=speaker_game_dict[game.value][0])
        
        target_text = gr.Textbox(label="Target Text", placeholder="Enter the text you want to convert to speech.")
        output_audio = gr.Audio(label="Generated Audio", type="filepath")

        def update_speakers(game):
            # Swap the speaker choices when the game changes and reset to the first one
            return gr.update(choices=speaker_game_dict[game], value=speaker_game_dict[game][0])

        game.change(update_speakers, inputs=game, outputs=speaker)

        text2speech_button = gr.Button("Generate Speech")
        text2speech_button.click(text2speech, inputs=[target_text, game, speaker], outputs=output_audio)

    demo.launch()