Spaces:

ASLP-lab
/

OSUM-EChat

Running on Zero

File size: 100,706 Bytes

841f290

import logging
import os
from typing import Dict, List, Optional, Union
import torchaudio
import torch
from peft import LoraConfig, TaskType, get_peft_model
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList, StoppingCriteriaList

from patches.cumstom_stop_criteria import InterruptStopper, S2SStopCriteria, MaxTokenStopper
from patches.custom_speech_ngram_blocking import SpeechOnlyNGramBlockingLogitsProcessor, OSUM_chat_LogitsProcessor
from wenet.osum_echat.utils4llmasr import make_streaming_mode_from_s2s, do_embedding_for_two_embeds
from wenet.transformer.encoder import TransformerEncoder, TransformerEncoder2
from wenet.osum_echat.utils4llmasr import *
from gxl_ai_utils.utils import utils_file

from wenet.osum_echat.downsampler import get_downsampler, osum_echat2Conv1dSubsampling
from wenet.transformer.swish import New_gelu4npu
from wenet.utils.mask import make_pad_mask


class LLMASR_Model(nn.Module):
    def __init__(self,
                 encoder,
                 encoder_output_dim,
                 llm_path,
                 lora=True, lora_alpha=32, lora_rank=8, lora_dropout=0.1,
                 is_inference=False,
                 downsample_rate=1,
                 adapter_type='osum_echat2',
                 speech_token_num=0,
                 train_speech_out=False):
        """"""
        super().__init__()
        utils_file.logging_limit_print(f"instruct_version: LLMASR_Model init, is_inference={is_inference}, downsample_rate={downsample_rate}, adapter_type={adapter_type}, speech_token_num={speech_token_num}, train_speech_out={train_speech_out}")
        self.downsample_rate = downsample_rate

        self.encoder = encoder
        self.ln_speech = nn.LayerNorm(encoder_output_dim)

        # 连接层, 51.6M
        if adapter_type == 'osum_echat':
            self.speech_transformer = TransformerEncoder(
                input_size=encoder_output_dim,
                output_size=encoder_output_dim,
                attention_heads=4,
                linear_units=2560,
                num_blocks=4,
                dropout_rate=0.1,
                positional_dropout_rate=0.1,
                attention_dropout_rate=0.0,
                input_layer="linear",
                pos_enc_layer_type="abs_pos",
                normalize_before=True
            )
        else:
            self.speech_transformer = None

        self.llama_model = AutoModelForCausalLM.from_pretrained(
            llm_path,
            # torch_dtype=torch.float32 if is_inference else torch.float16,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            output_hidden_states=True,
        )
        self.s2s_stop_criteria = None
        self.max_token_criteria_list = None

        self.max_length = 4000
        self.min_length = 1
        self.num_beams = 4
        self.do_sample = True
        self.top_p = 0.9
        self.top_k = 5
        self.repetition_penalty = 1.05
        self.length_penalty = 1.0
        self.temperature = 1.0
        self.IGNORE_ID = -100

        # lora
        self.lora = lora
        if lora:
            utils_file.logging_limit_print("OSUM-EChat： 使用lora了")
            # target_modules = ['w_pack', 'o_proj', 'gate_proj', 'down_proj']
            target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'down_proj']
            self.peft_config = LoraConfig(
                task_type=TaskType.CAUSAL_LM,
                inference_mode=is_inference,
                r=lora_rank,
                lora_alpha=lora_alpha,
                lora_dropout=lora_dropout,
                target_modules=target_modules,
            )
            self.llama_model = get_peft_model(self.llama_model, self.peft_config)

        # tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            llm_path, use_fast=False, trust_remote_code=True)
        """
        设置分词器的pad_token和padding的方向。
        """
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.padding_side = "right"
        self.eos_token_id = self.tokenizer.eos_token_id

        if hasattr(self.llama_model.config, 'hidden_size'):
            utils_file.logging_limit_print(
                f"self.llama_model.config.hidden_size: {self.llama_model.config.hidden_size}")
            if adapter_type == 'osum_echat2':
                self.down_sample_2 = osum_echat2Conv1dSubsampling(encoder_output_dim, self.llama_model.config.hidden_size)
            elif adapter_type == 'osum_echat':
                self.down_sample_2 = get_downsampler(downsample_rate, encoder_output_dim)
                self.speech_llama_proj = nn.Linear(
                    encoder_output_dim, self.llama_model.config.hidden_size)
        else:
            raise NotImplementedError("self.llama_model.config.hidden_size not exist")

        self.embed_tokens = self.llama_model.model.model.embed_tokens if self.lora else self.llama_model.model.embed_tokens
        self.lm_head = self.llama_model.model.lm_head if self.lora else self.llama_model.lm_head
        self.llm_vocab_size  = self.lm_head.weight.shape[0]
        self.speech_token_num = speech_token_num
        # init speech token module
        if speech_token_num > 0:
            utils_file.logging_info(f'OSUM-EChat： 进行语音token生成任务， speech_token_num: {speech_token_num}')
            self.speech_token_emded = torch.nn.Embedding(speech_token_num + 2, self.llama_model.config.hidden_size)
            self.speech_head = torch.nn.Linear(self.llama_model.config.hidden_size, speech_token_num)
        else:
            # 不做任何处理
            self.speech_head = nn.Identity()
            self.speech_token_emded = nn.Identity()
            self.speech_model = nn.Identity()
        self.train_speech_out = train_speech_out
        utils_file.logging_info(f'OSUM-EChat： 是否进行语音输出训练：{self.train_speech_out}')
        self.loss_fct = CrossEntropyLoss(reduction='mean')
        self.unk_token_id = 7672 # &&对应的id
        self.add_embed_head = True
        self.init_custom_speech_repetition_penalty()
        self.init_custom_stop_criteria()

    def set_task_type(self, task_type: str):
        """设置任务类型，用于设置生成的初始类型
        Args:
            task_type (str): 任务类型，从("ASR", "TTS", "S2S")选择
        """
        assert task_type in ("ASR", "TTS", "S2S")
        if task_type == "ASR":
            self.llama_model.text_phase = True
        elif task_type == "TTS":
            self.llama_model.text_phase = False
        elif task_type == "S2S":
            self.llama_model.text_phase = True

    def do_add_speech_embed_head(self):
        if self.add_embed_head:
            self.llama_model.speech_token_emded = self.speech_token_emded.to(torch.bfloat16)
            self.llama_model.speech_head = self.speech_head.to(torch.bfloat16)
            # self.llama_model.speech_token_emded = self.speech_token_emded.to(torch.bfloat16)
            # self.llama_model.speech_head = self.speech_head.to(torch.bfloat16) # 带lora的时候用
            self.add_embed_head = False


    def init_custom_speech_repetition_penalty(self):
        """

        """
        self.s2s_repetition_penalty = LogitsProcessorList()
        # self.speech_repetition_penalty = SpeechOnlyRepetitionPenaltyLogitsProcessor(speech_token_num=4097, penalty=1.5)
        self.speech_repetition_penalty = SpeechOnlyNGramBlockingLogitsProcessor(speech_token_num=4097, repeat_times=5,
                                                                                special_token_repeat_times_dict={
                                                                                    1446: 10})
        self.osum_chat_logit_processor1 = OSUM_chat_LogitsProcessor([99119, 1808, 7863], [102185, 17714, 31252])
        self.s2s_repetition_penalty.append(self.osum_chat_logit_processor1)
        self.s2s_repetition_penalty.append(self.speech_repetition_penalty)
        self.llama_model.speech_repetition_penalty = self.speech_repetition_penalty

    def init_custom_stop_criteria(self):
        """
        创建需要的stop criteria
        1. 对于t2t任务，遇到text_eos停止
        2. 对于t2s任务，遇到speech_eos停止
        3. 对于s2s任务，遇到speech_eos停止
        同时要取消原本的停止条件
        if generation_config._eos_token_tensor is not None:
        取消 generation_config._eos_token_tensor 的停止，尝试直接给一个大于vocb_size的eos_token
        """
        self.interrupt = InterruptStopper()
        self.s2s_stop_criteria = StoppingCriteriaList()
        self.s2s_stop_criteria.append(S2SStopCriteria(text_eos_id=151645, speech_eos_id=self.speech_token_num - 1))
        self.s2s_stop_criteria.append(MaxTokenStopper(2000))
        self.s2s_stop_criteria.append(self.interrupt)

    def get_label_embedding(self, labels, labels_lengths, unk_id=7672):
        """"""
        labels_pad_mask = make_pad_mask(labels_lengths)  # B, L
        unk_mask = (labels == unk_id)  # B, L
        labels_pad_mask = labels_pad_mask | unk_mask  #
        labels = labels.masked_fill(labels_pad_mask, 0)
        labels_embeds = self.embed_tokens(labels)
        labels_target = labels.masked_fill(labels_pad_mask, self.IGNORE_ID)  # B, L
        labels_mask = ~labels_pad_mask
        return labels_embeds, labels_target, labels_mask

    def get_speech_token_label_embedding(self, speech_token_labels, speech_tokens_length):
        """"""
        speech_tokens_pad_mask = make_pad_mask(speech_tokens_length)  # B, L
        speech_token_labels = speech_token_labels.masked_fill(speech_tokens_pad_mask, 0)
        speech_token_labels_embeds = self.speech_token_emded(speech_token_labels)
        # utils_file.logging_limit_print(f'进行speech_token_labels修改，修改前 speech_token_labels',
        #    speech_token_labels.shape, speech_token_labels[0][-1], speech_token_labels[0][0])
        speech_token_labels = speech_token_labels + self.llm_vocab_size
        # utils_file.logging_limit_print(f'进行speech_token_labels修改，修改后 speech_token_labels',
        #    speech_token_labels.shape, speech_token_labels[0][-1], speech_token_labels[0][0])
        speech_token_labels_target = speech_token_labels.masked_fill(speech_tokens_pad_mask, self.IGNORE_ID)  # B, L
        speech_token_labels_mask = ~speech_tokens_pad_mask
        return speech_token_labels_embeds, speech_token_labels_target, speech_token_labels_mask

    def _get_embedding_for_history(self, history_batch, device):
        """
        prompt_patern1,prompt,history, wav, prompt_patern2,txt,answer_wav,
        historcy_batch的内容是：
        [   big_embed,
            [

                {wav: feat （L，D:80)，->过encoder+link ,得到（L1, 2048)
                 txt: labels (L，), ->labels_embeds = self.embed_tokens(labels) -> (L2, 2048)， 带txt eos
                },->(L1+L2, 2048)
                {wav: feat （L，D)，
                 txt: labels (L，),
                },->(L3+L4, 2048)
            ]-> (L1+L2+L3+L4, 2048)，len:L1+L2+L3+L4
            [],->(1, 2048) ,len:0
            [
                {wav: feat （L，D)，
                 txt: labels (L，),
                },
            ],-> (L1+L2, 2048),len:L1+L2
        ]
        将每一条的历史信息的embedding拼接起来,如果有空历史信息，则用0pad, 最后得到pad后的history_embedding(B, L, D), history_lens(B)
        Args:
            history_batch:
            device:

        Returns:
            history_embedding: B, L, D
            history_lens: B

        """
        assistant_start ="<|im_end|>\n<|im_start|>assistant\n"
        assistant_start_id = self.tokenizer([assistant_start], return_tensors="pt"
                                         )['input_ids'].to(device)
        assistant_start_embedding = self.embed_tokens(assistant_start_id.squeeze(0))
        assistant_end ="<|im_end|>\n"
        assistant_end_id = self.tokenizer([assistant_end], return_tensors="pt"
                                       )['input_ids'].to(device)
        assistant_end_embedding = self.embed_tokens(assistant_end_id.squeeze(0))
        user_start = "<|im_start|>user\n"
        user_start_id = self.tokenizer([user_start], return_tensors="pt"
                                        )['input_ids'].to(device)
        user_start_embedding = self.embed_tokens(user_start_id.squeeze(0))
        user_end ="<|im_end|>\n"
        user_end_id = self.tokenizer([user_end], return_tensors="pt"
                                      )['input_ids'].to(device)
        user_end_embedding = self.embed_tokens(user_end_id.squeeze(0))
        batch_embeddings = []
        history_lens = []
        # 判断是否所有样本都没有历史
        if all(len(history) == 0 for history in history_batch):
            return None, None

        for history in history_batch:
            history_embeds = []

            for item in history:
                wav_feat = item['wav'].to(device)  # shape: (L, D)
                wav_feat = wav_feat.unsqueeze(0).to(device) # shape: (1, L, D)
                wav_embed, wav_mask = self._get_embedding_from_wav(wav_feat, torch.tensor([wav_feat.size(1)], device=device, dtype=torch.long))
                wav_embed = wav_embed.squeeze(0)  # shape: (L, D)
                if len(history_embeds) != 0:
                    history_embeds.append(user_start_embedding) # 第一个user start 不要
                history_embeds.append(wav_embed)
                history_embeds.append(user_end_embedding)
                history_embeds.append(assistant_start_embedding)
                labels = item['txt']  # shape: (L,)
                labels = torch.tensor(labels, device=device, dtype=torch.long)
                embed = self.embed_tokens(labels)  # (L2, D)，一般 L2 = L
                history_embeds.append(embed)
                history_embeds.append(assistant_end_embedding)
            history_embeds.append(user_start_embedding) # 最后添加一个user start

            if history_embeds:
                # 拼接所有历史条目的 embedding: (sum(Li), D)
                full_embed = torch.cat(history_embeds, dim=0)
                history_lens.append(full_embed.size(0))
            else:
                # 空历史
                full_embed = torch.zeros((1, self.embed_tokens.embedding_dim), device=device)
                history_lens.append(0)

            batch_embeddings.append(full_embed)

        # padding 到 batch 中最大长度
        padded_embeddings = pad_sequence(batch_embeddings, batch_first=True, padding_value=0.0)  # (B, L, D)
        history_lens = torch.tensor(history_lens, device=device, dtype=torch.long)
        padded_embeddings = padded_embeddings.to(device)

        return padded_embeddings, history_lens


    def forward(self,
                batch,
                device,
                ):
        """"""
        output_type = batch['output_type']
        # qwen_instruct_prompt_pattern_chat = "<|im_start|>system\nYou are OSUM-chat, a dialogue. You understand both the meaning and paralinguistic cues in speech, as well as input text, and respond appropriately.<|im_end|>\n<|im_start|>user\n"
        qwen_instruct_prompt_pattern_chat_s2s = "<|im_start|>system\nYou are OSUM-chat, a speech-to-speech dialogue assistant by ASLP Lab. You understand both the meaning and paralinguistic cues in speech then respond with appropriate text and emotionally matching synthetic speech.<|im_end|>\n"
        qwen_instruct_prompt_pattern_chat_s2s_think = "<|im_start|>system\nYou are OSUM-chat, a speech-to-speech dialogue assistant by ASLP Lab. You understand both the meaning and paralinguistic cues in speech. Before responding, first output your reasoning inside <think>...</think end>, analyzing the user’s words and vocal cues. Then generate a reply with appropriate text and emotionally matched synthetic speech.<|im_end|>\n"
        qwen_instruct_prompt_pattern_chat_s2s_streaming = "<|im_start|>system\nYou are OSUM-chat, a speech-to-speech dialogue assistant by ASLP Lab. You analyze speech (content + paralinguistic cues) and respond with interleaved text and emotionally-matched synthetic speech.<|im_end|>\n"
        qwen_instruct_prompt_pattern_chat_s2s_streaming_think = "<|im_start|>system\nYou are OSUM-chat, a speech-to-speech dialogue assistant by ASLP Lab. You analyze speech (both content and paralinguistic cues). Before responding, output your reasoning in <think>...</think end>. Then reply with interleaved text and emotionally matched synthetic speech.<|im_end|>\n"
        qwen_instruct_prompt_pattern_chat_s2t = "<|im_start|>system\nYou are OSUM-chat, a speech-to-text dialogue assistant by ASLP Lab. You understand both the meaning and paralinguistic cues in speech then respond exclusively with appropriate text.<|im_end|>\n"
        qwen_instruct_prompt_pattern__chat_t2t = "<|im_start|>system\nYou are OSUM-chat, a text-to-text dialogue assistant by ASLP Lab. You understand user input in text then respond exclusively with appropriate text.<|im_end|>\n"
        qwen_instruct_prompt_pattern_1_understand = "<|im_start|>system\nYou are OSUM-chat, an audio understanding assistant by ASLP Lab. You can transcribe speech accurately and analyze paralinguistic cues to provide precise text responses.<|im_end|>\n"
        qwen_instruct_prompt_pattern_1_tts = "<|im_start|>system\nYou are OSUM-chat, a speech synthesis assistant by ASLP Lab. You generate natural and fluent speech from text input.<|im_end|>\n"
        qwen_instruct_prompt_pattern_1_tts_streaming = "<|im_start|>system\nYou are OSUM-chat, a speech synthesis assistant by ASLP Lab. You generate natural speech from text input and output both audio and the original text in interleaved format.<|im_end|>\n"
        qwen_instruct_prompt_pattern_1_old = "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n"
        qwen_instruct_prompt_pattern_1_s2t_thinking = "<|im_start|>system\nYou are OSUM-chat, a thinking-enabled speech-to-text dialogue assistant by ASLP Lab. You not only comprehend the semantic meaning and paralinguistic cues in speech but also engage in deliberate reasoning to process such information. Based on this thinking process, you then respond exclusively with appropriate text.<|im_end|>\n"
        # user_start = "<|im_start|>user\n"
        # 赋予不同的系统提示。
        if output_type == "s2t_chat":
            system_prompt = qwen_instruct_prompt_pattern_chat_s2t
        elif output_type == "s2t_chat_fake":
            system_prompt = qwen_instruct_prompt_pattern_chat_s2s_think
        elif output_type == "text":
            system_prompt = qwen_instruct_prompt_pattern_1_understand
        elif output_type == "speech2text_token" or output_type == "speech2text_token_history":
            system_prompt = qwen_instruct_prompt_pattern_chat_s2s
        elif output_type == "text2token":
            system_prompt = qwen_instruct_prompt_pattern_1_tts
        elif output_type == "speech2text_token_streaming":
            system_prompt = qwen_instruct_prompt_pattern_chat_s2s_streaming
        elif output_type == "speech2text_token_think":
            system_prompt = qwen_instruct_prompt_pattern_chat_s2s_think
        elif output_type == "text2token_streaming":
            system_prompt = qwen_instruct_prompt_pattern_1_tts_streaming
        elif output_type == "text2text":
            system_prompt = qwen_instruct_prompt_pattern__chat_t2t
        elif output_type == "s2t_chat_think":
            system_prompt = qwen_instruct_prompt_pattern_1_s2t_thinking
        else:
            system_prompt = qwen_instruct_prompt_pattern_1_old
        # if output_type == "speech2text_token_history":
        # if output_type == "text2text" or output_type == "text":
        #     qwen_instruct_prompt_pattern_1 = qwen_instruct_prompt_pattern_1_old
        # elif output_type == "speech2text_token" or output_type == "speech2text_token_streaming" or output_type == "text2text" or output_type == "s2t_chat":
        #     qwen_instruct_prompt_pattern_1 = qwen_instruct_prompt_pattern_chat
        # elif output_type == "text2token":
        #     qwen_instruct_prompt_pattern_1 = qwen_instruct_prompt_pattern_1_tts
        # else:
        #     qwen_instruct_prompt_pattern_1 = qwen_instruct_prompt_pattern_1_old
        system_prompt = system_prompt +  "<|im_start|>user\n"

        rank = int(os.environ.get('RANK', 0))
        utils_file.logging_limit_print(f'xxx output_type {output_type}, rank {rank}')
        # if output_type == "s2t_chat":
        #     output_type = "text"
        # assert output_type in ['text', 'speech2text_token', 'text2token'], f"output_type:{output_type} not support"
        # speech inputs
        if output_type == 'text' or output_type == 's2t_chat' or output_type == 's2t_chat_fake' or output_type== "s2t_chat_think" or output_type == 'speech2text_token' or output_type == "speech2text_token_streaming" or output_type == "speech2text_token_think" or output_type == "speech2text_token_history":
            wavs = batch['feats'].to(device)
            # utils_file.logging_limit_print(f'xxx wav shape {wavs.shape}')
            wavs_len = batch['feats_lengths'].to(device)
            B = wavs.shape[0]
            # utils_file.logging_limit_print(f"xxx {wavs_len}")
            speech_embeds, speech_masks = self._get_embedding_from_wav(wavs, wavs_len)
            # utils_file.logging_limit_print(f'xxx speech embeding shape {speech_embeds.shape}')
            # utils_file.logging_limit_print(f'xxx speech mask shape {speech_masks.shape}')
            # utils_file.logging_limit_print(f'xxx speech mask 0 {speech_masks[0]}')
            speech_target = torch.full(speech_masks.shape, self.IGNORE_ID).to(
                speech_embeds.device)
            # utils_file.logging_limit_print(f'xxx speech target shape {speech_target.shape}')
            # utils_file.logging_limit_print(f'xxx speech target 0 {speech_target[0]}')
            # add bos and eos
            speech_embeds, speech_masks, speech_target = self._add_bos_eos(0+self.speech_token_num,
                                                                           1+self.speech_token_num,
                                                                           speech_embeds, speech_masks, speech_target)
        elif output_type == "text2token" or output_type == "text2token_streaming":
            labels = batch['target'].to(device)
            labels_lengths = batch['target_lengths'].to(device) -1  # 减1是因为要去掉eos
            B = labels.shape[0]
            #  text 2 token ,拿到文本序列,
            max_len = max(labels_lengths) + 1
            labels_pad_mask = make_pad_mask(labels_lengths, max_len=max_len)
            labels = labels.masked_fill(labels_pad_mask, 0)
            speech_embeds = self.embed_tokens(labels)  # B, L, D
            speech_target = torch.full(labels_pad_mask.shape, self.IGNORE_ID).to(
                speech_embeds.device)
            speech_masks = ~labels_pad_mask

            # add bos and eos
            # speech_embeds, speech_masks, speech_target = self._add_bos_eos(0+self.speech_token_num,
            #                                                                1 + self.speech_token_num,
            #                                                                speech_embeds, speech_masks, speech_target)
        else: # text2text
            speech_embeds = None
            speech_masks = None
            speech_target = None
        # utils_file.logging_limit_print(f'xxx after add bos eos speech embeding shape {speech_embeds.shape}')
        # utils_file.logging_limit_print(f'xxx after add bos eos speech mask shape {speech_masks.shape}')
        # utils_file.logging_limit_print(f'xxx after add bos eos speech target shape {speech_target.shape}')
        # utils_file.logging_limit_print(f'xxx after add bos eos speech mask 0 {speech_masks[0]}')
        # utils_file.logging_limit_print(f'xxx after add bos eos speech target 0 {speech_target[0]}')

        # prompt
        if 'prompt' in batch:
            prompt = batch['prompt'].to(device)
            prompt_lengths = batch['prompt_lengths'].to(device)
            prompt_pad_mask = make_pad_mask(prompt_lengths)  # B, L
            prompt = prompt.masked_fill(prompt_pad_mask, self.tokenizer.eos_token_id)
            prompt_embeds = self.embed_tokens(prompt)  # B, L, D
            prompt_target = torch.full(prompt.shape, self.IGNORE_ID).to(
                device)  # B, L
            prompt_mask = ~prompt_pad_mask
            # utils_file.logging_limit_print(f'xxx prompt embeding shape {prompt_embeds.shape}')
            # utils_file.logging_limit_print(f'xxx prompt mask shape {prompt_mask.shape}')
            # utils_file.logging_limit_print(f'xxx prompt target shape {prompt_target.shape}')
        else:
            prompt_embeds = None
            prompt_mask = None
            prompt_target = None

        inputs_embeds_list = []
        attention_mask_list = []
        target_list = []
        prompt_pattern1 = self.tokenizer([system_prompt] * len(batch['target']), return_tensors="pt"
                                         )['input_ids'].to(device)
        prompt_pattern1_embeds = self.embed_tokens(prompt_pattern1)
        prompt_pattern1_lens = torch.tensor([len(i) for i in prompt_pattern1]).to(device)
        prompt_pattern1_mask = ~make_pad_mask(prompt_pattern1_lens)
        prompt_pattern1_target = torch.full(prompt_pattern1.shape, self.IGNORE_ID).to(
            device)  # B, L

        # user_start_id = self.tokenizer([user_start] * len(batch['target']), return_tensors="pt"
        #                                 )['input_ids'].to(device)
        # user_start_embeds = self.embed_tokens(user_start_id)
        # user_start_lens = torch.tensor([len(i) for i in user_start_id]).to(device)
        # user_start_mask = ~make_pad_mask(user_start_lens)
        # user_start_target = torch.full(user_start_id.shape, self.IGNORE_ID).to(
        #     device)  # B, L

        qwen_instruct_prompt_pattern_2 = "<|im_end|>\n<|im_start|>assistant\n"
        prompt_pattern2 = self.tokenizer([qwen_instruct_prompt_pattern_2] * len(batch['target']), return_tensors="pt"
                                         )['input_ids'].to(device)
        prompt_pattern2_embeds = self.embed_tokens(prompt_pattern2)
        prompt_pattern2_lens = torch.tensor([len(i) for i in prompt_pattern2]).to(device)
        prompt_pattern2_mask = ~make_pad_mask(prompt_pattern2_lens)
        prompt_pattern2_target = torch.full(prompt_pattern2.shape, self.IGNORE_ID).to(
            device)  # B, L

        inputs_embeds_list.append(prompt_pattern1_embeds)
        attention_mask_list.append(prompt_pattern1_mask)
        target_list.append(prompt_pattern1_target)
        streaming_error = False
        if output_type == "speech2text_token_streaming":
            rank = int(os.environ.get('RANK', 0))
            utils_file.logging_limit_print(f'开始处理speech2text_token streaming 任务')
            labels = batch['target'].to(device)
            labels_lengths = batch['target_lengths'].to(device)
            speech_token_labels = batch['speech_tokens'].to(device)
            speech_tokens_length = batch['speech_tokens_length'].to(device)

            labels_pad_mask = make_pad_mask(labels_lengths)  # B, L
            labels = labels.masked_fill(labels_pad_mask, 0)

            speech_tokens_pad_mask = make_pad_mask(speech_tokens_length)  # B, L
            speech_token_labels = speech_token_labels.masked_fill(speech_tokens_pad_mask, 0)
            speech_token_labels = speech_token_labels + self.llm_vocab_size
            if rank == 0:
                utils_file.logging_limit_print(f'labels.shape {labels.shape}')
                utils_file.logging_limit_print(f'labels_lengths.shape {labels_lengths.shape}')
                utils_file.logging_limit_print(f'labels[0] {labels[0]}')
                utils_file.logging_limit_print(f'------------------------')
                utils_file.logging_limit_print(f'speech_token_labels.shape {speech_token_labels.shape}')
                utils_file.logging_limit_print(f'speech_tokens_length.shape {speech_tokens_length.shape}')
                utils_file.logging_limit_print(f'speech_token_labels[0] {speech_token_labels[0]}')
                utils_file.logging_limit_print(f'==========================')
            streaming_concat_ids, streaming_concat_lens = make_streaming_mode_from_s2s(labels, labels_lengths,
                                                                                       speech_token_labels,
                                                                                       speech_tokens_length)
            if rank == 0:
                utils_file.logging_limit_print(f'streaming_concat_ids.shape {streaming_concat_ids.shape}')
                utils_file.logging_limit_print(f'streaming_concat_lens.shape {streaming_concat_lens.shape}')
                utils_file.logging_limit_print(f'streaming_concat_lens {streaming_concat_lens[0]}')
                utils_file.logging_limit_print(f'xxx streaming_concat_ids[0] : {streaming_concat_ids[0]}')
                utils_file.logging_limit_print(f'------------------------')
            streaming_concat_embeddings = do_embedding_for_two_embeds(streaming_concat_ids, self.llm_vocab_size, self.embed_tokens,
                                                                      self.speech_token_emded)
            streaming_concat_pad_mask = make_pad_mask(streaming_concat_lens)
            streaming_concat_target = streaming_concat_ids.masked_fill(streaming_concat_pad_mask, self.IGNORE_ID)
            streaming_concat_mask = ~streaming_concat_pad_mask
            if rank == 0:
                utils_file.logging_limit_print(f'streaming_concat_embeddings.shape {streaming_concat_embeddings.shape}')
                utils_file.logging_limit_print(f'streaming_concat_mask shape {streaming_concat_mask.shape}')
                utils_file.logging_limit_print(f'------------------------')
            # if prompt_embeds is not None:  # 对于s2s 对话任务，不再使用user prompt 输入
            #     inputs_embeds_list.append(prompt_embeds)
            #     attention_mask_list.append(prompt_mask)
            #     target_list.append(prompt_target)

            # ===================history===================================
            history_batch = batch.get('history', [])
            history_embedding, history_lens = self._get_embedding_for_history(history_batch, device)
            if history_embedding is not None:
                utils_file.logging_info(f'OSUM-EChat： 进行历史信息的embedding')
                history_pad_mask = make_pad_mask(history_lens)  # B, L
                history_target = torch.full(history_pad_mask.shape, self.IGNORE_ID).to(device)  # B, L
                history_mask = ~history_pad_mask
                inputs_embeds_list.append(history_embedding)
                attention_mask_list.append(history_mask)
                target_list.append(history_target)
                utils_file.logging_limit_print(f'xxx history embeding shape {history_embedding.shape}')
                utils_file.logging_limit_print(f'xxx history mask shape {history_mask.shape}')
                utils_file.logging_limit_print(f'xxx history target shape {history_target.shape}')
            else:
                utils_file.logging_limit_print(f'history is None')
            # ==========================history end ===================
            inputs_embeds_list.extend(
                [  speech_embeds, prompt_pattern2_embeds, streaming_concat_embeddings])
            attention_mask_list.extend([speech_masks, prompt_pattern2_mask, streaming_concat_mask])
            target_list.extend([speech_target, prompt_pattern2_target, streaming_concat_target])
        elif output_type == "text2token_streaming":
            rank = int(os.environ.get('RANK', 0))
            utils_file.logging_limit_print(f'开始tts streaming 任务')
            labels = batch['target'].to(device)
            labels_lengths = batch['target_lengths'].to(device)
            speech_token_labels = batch['speech_tokens'].to(device)
            speech_tokens_length = batch['speech_tokens_length'].to(device)

            labels_pad_mask = make_pad_mask(labels_lengths)  # B, L
            labels = labels.masked_fill(labels_pad_mask, 0)

            speech_tokens_pad_mask = make_pad_mask(speech_tokens_length)  # B, L
            speech_token_labels = speech_token_labels.masked_fill(speech_tokens_pad_mask, 0)
            speech_token_labels = speech_token_labels + self.llm_vocab_size
            streaming_concat_ids, streaming_concat_lens = make_streaming_mode_from_s2s(labels, labels_lengths,
                                                                                       speech_token_labels,
                                                                                       speech_tokens_length)
            streaming_concat_embeddings = do_embedding_for_two_embeds(streaming_concat_ids, self.llm_vocab_size,
                                                                      self.embed_tokens,
                                                                      self.speech_token_emded)
            streaming_concat_pad_mask = make_pad_mask(streaming_concat_lens)
            streaming_concat_target = streaming_concat_ids.masked_fill(streaming_concat_pad_mask, self.IGNORE_ID)
            streaming_concat_mask = ~streaming_concat_pad_mask
            # if prompt_embeds is not None: # 对于tts 对话任务，不再使用user prompt 输入
            #     inputs_embeds_list.append(prompt_embeds)
            #     attention_mask_list.append(prompt_mask)
            #     target_list.append(prompt_target)
            inputs_embeds_list.extend(
                [ speech_embeds, prompt_pattern2_embeds, streaming_concat_embeddings])
            attention_mask_list.extend([speech_masks, prompt_pattern2_mask, streaming_concat_mask])
            target_list.extend([speech_target, prompt_pattern2_target, streaming_concat_target])

        elif output_type == 'speech2text_token' or output_type == "speech2text_token_think" or output_type == "speech2text_token_history":
            utils_file.logging_limit_print(f'xxx 开始处理speech2text_token任务')
            labels = batch['target'].to(device)
            labels_lengths = batch['target_lengths'].to(device)
            speech_token_labels = batch['speech_tokens'].to(device)
            speech_tokens_length = batch['speech_tokens_length'].to(device)

            labels_embeds, labels_target, labels_mask = self.get_label_embedding(labels, labels_lengths)
            speech_token_labels_embeds, speech_token_labels_target, speech_token_labels_mask = self.get_speech_token_label_embedding(
                speech_token_labels, speech_tokens_length)
            # if prompt_embeds is not None: # 对于s2s 对话任务，不再使用user prompt 输入
            #     inputs_embeds_list.append(prompt_embeds)
            #     attention_mask_list.append(prompt_mask)
            #     target_list.append(prompt_target)
            # ===================history===================================
            history_batch = batch.get('history', [])
            history_embedding, history_lens = self._get_embedding_for_history(history_batch, device)
            if history_embedding is not None:
                utils_file.logging_info(f'OSUM-EChat： 进行历史信息的embedding')
                history_pad_mask = make_pad_mask(history_lens)  # B, L
                history_target = torch.full(history_pad_mask.shape, self.IGNORE_ID).to(device)  # B, L
                history_mask = ~history_pad_mask
                inputs_embeds_list.append(history_embedding)
                attention_mask_list.append(history_mask)
                target_list.append(history_target)
                utils_file.logging_limit_print(f'xxx history embeding shape {history_embedding.shape}')
                utils_file.logging_limit_print(f'xxx history mask shape {history_mask.shape}')
                utils_file.logging_limit_print(f'xxx history target shape {history_target.shape}')
            else:
                utils_file.logging_limit_print(f'history is None')
            # ==========================history end ===================
            inputs_embeds_list.extend(
                [ speech_embeds, prompt_pattern2_embeds, labels_embeds, speech_token_labels_embeds])
            attention_mask_list.extend([speech_masks, prompt_pattern2_mask, labels_mask, speech_token_labels_mask])
            target_list.extend([speech_target, prompt_pattern2_target, labels_target, speech_token_labels_target])
        elif output_type == "text2token":
            speech_token_labels = batch['speech_tokens'].to(device)
            speech_tokens_length = batch['speech_tokens_length'].to(device)
            speech_token_labels_embeds, speech_token_labels_target, speech_token_labels_mask = self.get_speech_token_label_embedding(
                speech_token_labels, speech_tokens_length)
            # if prompt_embeds is not None: # 对于tts 对话任务，不再使用user prompt 输入
            #     inputs_embeds_list.append(prompt_embeds)
            #     attention_mask_list.append(prompt_mask)
            #     target_list.append(prompt_target)
            inputs_embeds_list.extend([ speech_embeds, prompt_pattern2_embeds, speech_token_labels_embeds])
            attention_mask_list.extend([speech_masks, prompt_pattern2_mask, speech_token_labels_mask])
            target_list.extend([speech_target, prompt_pattern2_target, speech_token_labels_target])
        elif output_type == "text" or output_type == 's2t_chat' or output_type == "s2t_chat_fake" or output_type == "s2t_chat_think":
            labels = batch['target'].to(device)
            labels_lengths = batch['target_lengths'].to(device)
            labels_embeds, labels_target, labels_mask = self.get_label_embedding(labels, labels_lengths)
            if prompt_embeds is not None and output_type == 'text': # 对于s2t_chat 对话任务，不再使用user prompt 输入
                inputs_embeds_list.append(prompt_embeds)
                attention_mask_list.append(prompt_mask)
                target_list.append(prompt_target)
            elif output_type != 's2t_chat' or output_type != "s2t_chat_fake" or output_type != "s2t_chat_think":
                utils_file.logging_limit_print(
                    f'prompt is None,task: {batch["task"]}, prompt_embeds:{prompt_embeds}, prompt_mask:{prompt_mask}')
            inputs_embeds_list.extend([ speech_embeds, prompt_pattern2_embeds, labels_embeds])
            attention_mask_list.extend([speech_masks, prompt_pattern2_mask, labels_mask])
            target_list.extend([speech_target, prompt_pattern2_target, labels_target])
        elif output_type == "text2text":
            labels = batch['target'].to(device)
            labels_lengths = batch['target_lengths'].to(device)
            labels_embeds, labels_target, labels_mask = self.get_label_embedding(labels, labels_lengths)
            if prompt_embeds is not None:
                inputs_embeds_list.append(prompt_embeds)
                attention_mask_list.append(prompt_mask)
                target_list.append(prompt_target)
            else:
                utils_file.logging_limit_print(
                    f'prompt is None,task: {batch["task"]}, prompt_embeds:{prompt_embeds}, prompt_mask:{prompt_mask}')
            inputs_embeds_list.extend([ prompt_pattern2_embeds, labels_embeds])
            attention_mask_list.extend([ prompt_pattern2_mask, labels_mask])
            target_list.extend([ prompt_pattern2_target, labels_target])

        else:
            raise NotImplementedError(f'output_type {output_type} not support')

        inputs_embeds = torch.cat(inputs_embeds_list, dim=1)
        # utils_file.logging_limit_print(f'xxx final inputs_embeds shape {inputs_embeds.shape}')
        attention_mask = torch.cat(attention_mask_list, dim=1)
        # utils_file.logging_limit_print(f'xxx final attention_mask shape {attention_mask.shape}')
        # utils_file.logging_limit_print(f'xxx final attention_mask 0 {attention_mask[0]}')
        target = torch.cat(target_list, dim=1)
        # utils_file.logging_limit_print(f'xxx final  target shape {target.shape}')
        # utils_file.logging_limit_print(f'xxx final target 0 {target[0]}')
        # utils_file.logging_limit_print(f'OSUM-EChat output_type: {output_type}')
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)
        # utils_file.logging_limit_print(f'xxx final position_ids shape {position_ids.shape}')
        # utils_file.logging_limit_print(f'xxx final position_ids 0 {position_ids[0]}')
        if output_type == 'text' or output_type == 's2t_chat' or output_type == "s2t_chat_fake" or output_type == "s2t_chat_think" or output_type == "text2text":
            outputs = self.llama_model(
                inputs_embeds=inputs_embeds,
                labels=target,
                attention_mask=attention_mask,
                position_ids=position_ids.to(inputs_embeds.device)
            )
            loss = outputs['loss']
            return {"loss": loss,"output_type": output_type}
        else:
            utils_file.logging_limit_print(f'进行llama_model的 diy forward')
            outputs = self.llama_model(
                inputs_embeds=inputs_embeds,
                # labels=target,
                attention_mask=attention_mask,
                position_ids=position_ids.to(inputs_embeds.device)
            )
            hidden_states = outputs['hidden_states'][-1]
            logits = self.lm_head(hidden_states)
            logits2 = self.speech_head(hidden_states)  # speech_head
            combined_logits = torch.cat([logits, logits2], dim=-1)
            # combined_logits = self.new_lm_head(hidden_states)
            shift_logits = combined_logits[..., :-1, :].contiguous()
            shift_target = target[..., 1:].contiguous()
            # utils_file.logging_limit_print(
            #     f'xxx shift_logits shape: {shift_logits.shape}, shift_target shape: {shift_target.shape}')
            # utils_file.logging_limit_print(f'xxx shift_target 0 {shift_target[0]}')
            shift_logits = shift_logits.view(-1, combined_logits.shape[-1])  # 注意这里维度的调整，根据logits2的维度相应改变
            shift_target = shift_target.view(-1)
            shift_target = shift_target.to(shift_logits.device)
            loss = self.loss_fct(shift_logits, shift_target)
            loss.requires_grad_(True)
            return {"loss": loss,"output_type": output_type}


    def generate_s2s_streaming(
            self,
            wavs,
            wavs_len,
            prompt,
    ):
        self.llama_model.eval()
        speech_embeds, speech_masks = self._get_embedding_from_wav(wavs, wavs_len)
        speech_embeds, speech_masks, _ = self._add_bos_eos(0 +self.speech_token_num, 1+self.speech_token_num,
                                                           speech_embeds, speech_masks, None)

        device = speech_embeds.device

        prompt = self.tokenizer([prompt], return_tensors="pt"
                                )['input_ids'].to(speech_embeds.device)
        prompt_embeds = self.embed_tokens(prompt)

        qwen_instruct_prompt_pattern_1_s_input_chat = "<|im_start|>system\nYou are OSUM-chat, a dialogue assistant created by . You understand both the meaning and paralinguistic cues in users' speech, and respond appropriately with text or voice.<|im_end|>\n<|im_start|>user\n"

        qwen_instruct_prompt_pattern_1_t2t_chat = "<|im_start|>system\nYou are OSUM-chat, a dialogue assistant created by . You understand user input in text and respond with accurate and helpful text replies.<|im_end|>\n<|im_start|>user\n"
        qwen_instruct_prompt_pattern_1_old = "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n"
        qwen_instruct_prompt_pattern_1 =qwen_instruct_prompt_pattern_1_old
        prompt_pattern1 = self.tokenizer([qwen_instruct_prompt_pattern_1], return_tensors="pt"
                                         )['input_ids'].to(device)
        prompt_pattern1_embeds = self.embed_tokens(prompt_pattern1)
        prompt_pattern1_lens = torch.tensor([len(i) for i in prompt_pattern1]).to(device)
        prompt_pattern1_mask = ~make_pad_mask(prompt_pattern1_lens)
        prompt_pattern1_target = torch.full(prompt_pattern1.shape, self.IGNORE_ID).to(
            device)  # B, L

        qwen_instruct_prompt_pattern_2 = "<|im_end|>\n<|im_start|>assistant\n"
        prompt_pattern2 = self.tokenizer([qwen_instruct_prompt_pattern_2] , return_tensors="pt"
                                         )['input_ids'].to(device)
        start_id = prompt_pattern2[0][-1]
        new_prompt_pattern2 = prompt_pattern2[:,:-1]
        prompt_pattern2_embeds = self.embed_tokens(new_prompt_pattern2)
        prompt_pattern2_lens = torch.tensor([len(i) for i in new_prompt_pattern2],dtype=torch.long).to(device)
        prompt_pattern2_mask = ~make_pad_mask(prompt_pattern2_lens)
        prompt_pattern2_target = torch.full(new_prompt_pattern2.shape, self.IGNORE_ID).to(
            device)  # B, L

        embeds = torch.cat([prompt_pattern1_embeds,prompt_embeds, speech_embeds, prompt_pattern2_embeds], dim=1)
        atts = torch.ones(embeds.size()[:-1], dtype=torch.long).to(embeds.device)
        if self.embed_tokens.weight.dtype == torch.float16:
            # utils_file.logging_limit_print('generate(): self.embed_tokens.weight.dtype == torch.float16')
            embeds = embeds.to(torch.float16)
        max_len = 350
        hyps = [start_id]
        print(f'start_id: {start_id}')
        llm_out = self.llama_model(
            inputs_embeds=embeds,
            past_key_values=None,
            output_hidden_states=True
        )

        batch_size = 1
        top_k = 10
        top_p = 0.9
        temperature = 1
        cache = llm_out.past_key_values
        token_emb = self.embed_tokens(torch.tensor(hyps[-1:]).to(device)).unsqueeze(0)
        top_ps_tensor = torch.FloatTensor([top_p] * batch_size).to(device)
        top_ks_tensor = torch.LongTensor([top_k] * batch_size).to(device)
        temperatures_tensor = None if not temperature else torch.FloatTensor(
            [temperature] * batch_size).to(device)
        inferring_txt = True
        txt_finished = False
        speeech_finished = False
        hyps_text = ""
        speech_eos_num = 0
        txt_list = []
        token_list = []
        i_num = 0
        for i in range(max_len):
            if inferring_txt and not txt_finished:
                for i_txt in range(6):
                    i_num += 1
                    if i_num> 300:
                        break
                    llm_out = self.llama_model(
                        inputs_embeds=token_emb,
                        past_key_values=cache,
                        output_hidden_states=True
                    )
                    cache = llm_out.past_key_values
                    hidden_states = llm_out.hidden_states[-1]
                    token_logits = self.lm_head(hidden_states)
                    next_token_ids = self._sampler(
                        token_logits,
                        temperatures_tensor,
                        top_ps_tensor,
                        top_ks_tensor,
                    )
                    # next_token_ids = torch.argmax(token_logits, dim=-1)
                    print(i_num, next_token_ids, f'txt')
                    hyps.append(next_token_ids.item())
                    token_emb = self.embed_tokens(torch.tensor(hyps[-1:]).to(device)).unsqueeze(0)
                    if next_token_ids == self.eos_token_id:
                        txt_finished = True
                        hyps_text = self.tokenizer.decode(txt_list, skip_special_tokens=True, add_special_tokens=False)
                        print("hyps_text:", hyps_text)
                        print("text is over")
                        break
                    txt_list.append(next_token_ids.item())
                hyps_text = self.tokenizer.decode(txt_list, skip_special_tokens=True, add_special_tokens=False)
                print("hyps_text:", hyps_text)
                inferring_txt = False
            elif not speeech_finished:
                for i_speech in range(18):
                    i_num += 1
                    if i_num> 300:
                        break
                    llm_out = self.llama_model(
                        inputs_embeds=token_emb,
                        past_key_values=cache,
                        output_hidden_states=True
                    )
                    cache = llm_out.past_key_values
                    hidden_states = llm_out.hidden_states[-1]
                    token_logits = self.speech_head(hidden_states)
                    next_token_ids = self._sampler(
                        token_logits,
                        temperatures_tensor,
                        top_ps_tensor,
                        top_ks_tensor,
                    )
                    # next_token_ids = torch.argmax(token_logits, dim=-1)
                    hyps.append(next_token_ids.item())
                    print(i_num, next_token_ids, f'speech')
                    token_emb = self.speech_token_emded(torch.tensor(hyps[-1:]).to(device)).unsqueeze(0)
                    if next_token_ids == 4096:
                        speech_eos_num += 1
                        print(f'遇到 4096')
                        if speech_eos_num >= 2:
                            print("speech is over")
                            speeech_finished = True
                        break
                    token_list.append(next_token_ids.item())
                    inferring_txt = True
            if speeech_finished:
                break
            if i_num > 300:
                break
        return [hyps_text + "|" + str(token_list)]

    def generate(
            self,
            wavs,
            wavs_len,
            prompt,
            **kwargs
    ):
        self.llama_model.eval()
        self.set_task_type("ASR")
        self.do_add_speech_embed_head()
        speech_embeds, speech_masks = self._get_embedding_from_wav(wavs, wavs_len)
        speech_embeds, speech_masks, _ = self._add_bos_eos(0 + self.speech_token_num, 1 + self.speech_token_num,
                                                           speech_embeds, speech_masks, None)
        prompt = self.tokenizer([prompt], return_tensors="pt"
                                )['input_ids'].to(speech_embeds.device)
        prompt_embeds = self.embed_tokens(prompt)
        qwen_instruct_prompt_pattern_1 = "<|im_start|>system\nYou are OSUM-chat, an audio understanding. You can transcribe speech accurately and anaosum_echat2e paralinguistic cues to provide precise text responses.<|im_end|>\n<|im_start|>user\n"
        prompt_pattern1 = self.tokenizer([qwen_instruct_prompt_pattern_1] * len(wavs_len), return_tensors="pt"
                                         )['input_ids'].to(speech_embeds.device)
        prompt_pattern1_embeds = self.embed_tokens(prompt_pattern1)

        qwen_instruct_prompt_pattern_2 = "<|im_end|>\n<|im_start|>assistant\n"
        prompt_pattern2 = self.tokenizer([qwen_instruct_prompt_pattern_2] * len(wavs_len), return_tensors="pt"
                                         )['input_ids'].to(speech_embeds.device)
        prompt_pattern2_embeds = self.embed_tokens(prompt_pattern2)

        embeds = torch.cat([prompt_pattern1_embeds, prompt_embeds, speech_embeds, prompt_pattern2_embeds], dim=1)
        atts = torch.ones(embeds.size()[:-1], dtype=torch.long).to(embeds.device)

        if self.embed_tokens.weight.dtype == torch.float16 or self.embed_tokens.weight.dtype == torch.bfloat16:
            embeds = embeds.to(torch.bfloat16)
            atts = atts.to(torch.bfloat16)
        outputs = self.llama_model.generate(
            inputs_embeds=embeds,
            max_new_tokens=self.max_length,
            cache_implementation="static",
            # num_beams=self.num_beams,
            do_sample=self.do_sample,
            min_length=self.min_length,
            top_p=self.top_p,
            top_k=self.top_k,
            repetition_penalty=self.repetition_penalty,
            length_penalty=self.length_penalty,
            temperature=self.temperature,
            # attention_mask=atts,
            eos_token_id=151645,
            pad_token_id=-100,
            stopping_criteria=self.max_token_criteria_list,
            do_compile=True,
        )
        output_text = self.tokenizer.batch_decode(outputs, add_special_tokens=False, skip_special_tokens=True)
        return output_text

    def generate4chat(
            self,
            wavs,
            wavs_len,
            prompt=" ",
            do_sample=True,
            top_k=2,
            top_p=1,
            temperature=0.4,
            **kwargs
    ):
        print(f'do_sample: {do_sample}, top_k: {top_k}, top_p: {top_p}, temperature: {temperature}')
        self.llama_model.eval()
        self.set_task_type("ASR")
        self.do_add_speech_embed_head()
        # self.do_merge_embed_head()
        speech_embeds, speech_masks = self._get_embedding_from_wav(wavs, wavs_len)
        speech_embeds, speech_masks, _ = self._add_bos_eos(0 + self.speech_token_num, 1 + self.speech_token_num,
                                                           speech_embeds, speech_masks, None)
        prompt = self.tokenizer([prompt], return_tensors="pt"
                                )['input_ids'].to(speech_embeds.device)
        prompt_embeds = self.embed_tokens(prompt)

        # qwen_instruct_prompt_pattern_1 = "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n"
        # # sft
        qwen_instruct_prompt_pattern_1 = "<|im_start|>system\nYou are OSUM-chat, a speech-to-text dialogue assistant by ASLP Lab. You understand both the meaning and paralinguistic cues in speech then respond exclusively with appropriate text.<|im_end|>\n<|im_start|>user\n"
        prompt_pattern1 = self.tokenizer([qwen_instruct_prompt_pattern_1] * len(wavs_len), return_tensors="pt"
                                         )['input_ids'].to(speech_embeds.device)
        prompt_pattern1_embeds = self.embed_tokens(prompt_pattern1)

        qwen_instruct_prompt_pattern_2 = "<|im_end|>\n<|im_start|>assistant\n"
        prompt_pattern2 = self.tokenizer([qwen_instruct_prompt_pattern_2] * len(wavs_len), return_tensors="pt"
                                         )['input_ids'].to(speech_embeds.device)
        prompt_pattern2_embeds = self.embed_tokens(prompt_pattern2)

        embeds = torch.cat([prompt_pattern1_embeds, speech_embeds, prompt_pattern2_embeds], dim=1)
        atts = torch.ones(embeds.size()[:-1], dtype=torch.long).to(embeds.device)

        if self.embed_tokens.weight.dtype == torch.float16 or self.embed_tokens.weight.dtype == torch.bfloat16:
            # utils_file.logging_limit_print('generate(): self.embed_tokens.weight.dtype == torch.float16')
            # embeds = embeds.to(torch.float16)
            embeds = embeds.to(torch.bfloat16)
            atts = atts.to(torch.bfloat16)
        outputs = self.llama_model.generate(
            inputs_embeds=embeds,
            max_new_tokens=self.max_length,
            cache_implementation="static",
            # num_beams=1,
            do_sample=do_sample,
            min_length=self.min_length,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=self.repetition_penalty,
            length_penalty=1,
            temperature=temperature,
            # attention_mask=atts,
            eos_token_id=151645,
            pad_token_id=-100,
            do_compile=True,
            stopping_criteria=self.max_token_criteria_list,
        )

        output_text = self.tokenizer.batch_decode(outputs, add_special_tokens=False, skip_special_tokens=True)

        return output_text

    def generate4chat_think(
            self,
            wavs,
            wavs_len,
            prompt=" ",
            do_sample=True,
            top_k=2,
            top_p=1,
            temperature=0.4,
            **kwargs
    ):
        print(f'do_sample: {do_sample}, top_k: {top_k}, top_p: {top_p}, temperature: {temperature}')
        self.llama_model.eval()
        self.set_task_type("ASR")
        self.do_add_speech_embed_head()
        # self.do_merge_embed_head()
        speech_embeds, speech_masks = self._get_embedding_from_wav(wavs, wavs_len)
        speech_embeds, speech_masks, _ = self._add_bos_eos(0 + self.speech_token_num, 1 + self.speech_token_num,
                                                           speech_embeds, speech_masks, None)
        prompt = self.tokenizer([prompt], return_tensors="pt"
                                )['input_ids'].to(speech_embeds.device)
        prompt_embeds = self.embed_tokens(prompt)

        # qwen_instruct_prompt_pattern_1 = "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n"
        # # sft
        qwen_instruct_prompt_pattern_1 = "<|im_start|>system\nYou are OSUM-chat, a thinking-enabled speech-to-text dialogue assistant by ASLP Lab. You not only comprehend the semantic meaning and paralinguistic cues in speech but also engage in deliberate reasoning to process such information. Based on this thinking process, you then respond exclusively with appropriate text.<|im_end|>\n"
        prompt_pattern1 = self.tokenizer([qwen_instruct_prompt_pattern_1] * len(wavs_len), return_tensors="pt"
                                         )['input_ids'].to(speech_embeds.device)
        prompt_pattern1_embeds = self.embed_tokens(prompt_pattern1)

        qwen_instruct_prompt_pattern_2 = "<|im_end|>\n<|im_start|>assistant\n"
        prompt_pattern2 = self.tokenizer([qwen_instruct_prompt_pattern_2] * len(wavs_len), return_tensors="pt"
                                         )['input_ids'].to(speech_embeds.device)
        prompt_pattern2_embeds = self.embed_tokens(prompt_pattern2)

        embeds = torch.cat([prompt_pattern1_embeds, speech_embeds, prompt_pattern2_embeds], dim=1)
        atts = torch.ones(embeds.size()[:-1], dtype=torch.long).to(embeds.device)

        if self.embed_tokens.weight.dtype == torch.float16 or self.embed_tokens.weight.dtype == torch.bfloat16:
            # utils_file.logging_limit_print('generate(): self.embed_tokens.weight.dtype == torch.float16')
            # embeds = embeds.to(torch.float16)
            embeds = embeds.to(torch.bfloat16)
            atts = atts.to(torch.bfloat16)
        outputs = self.llama_model.generate(
            inputs_embeds=embeds,
            max_new_tokens=self.max_length,
            cache_implementation="static",
            # num_beams=1,
            do_sample=do_sample,
            min_length=self.min_length,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=self.repetition_penalty,
            length_penalty=1,
            temperature=temperature,
            # attention_mask=atts,
            eos_token_id=151645,
            pad_token_id=-100,
            do_compile=True,
            stopping_criteria=self.max_token_criteria_list,
        )

        output_text = self.tokenizer.batch_decode(outputs, add_special_tokens=False, skip_special_tokens=True)

        return output_text

    def generate_tts(
            self,
            device,
            text,
    ):
        # =====================准备input embedding=====================
        self.llama_model.eval()
        # 得到模型所在的device
        # device = self.llama_model.device

        self.set_task_type("TTS")
        self.do_add_speech_embed_head()
        # labels_lengths = torch.tensor([len(text[0])], dtype=torch.int64, device=device)
        # labels = text[:,:]
        labels = self.tokenizer(
            [text],
            return_tensors="pt",
            add_special_tokens=False
        ).to(
            self.embed_tokens.weight.device).input_ids  # (1, L)
        labels = labels.to(device)
        labels_lengths = torch.tensor([len(labels[0])], dtype=torch.int64, device=device)
        print(f'label_lengths:{labels_lengths}')
        print(f'labels:{labels}')
        labels_pad_mask = make_pad_mask(labels_lengths)  # B, L
        labels = labels.masked_fill(labels_pad_mask, 0)
        speech_embeds = self.embed_tokens(labels)  # B, L, D
        speech_target = torch.full(labels_pad_mask.shape, self.IGNORE_ID).to(
            speech_embeds.device)
        speech_masks = ~labels_pad_mask
        # speech_embeds, speech_masks, speech_target = self._add_bos_eos(0 + self.speech_token_num,
        #                                                                1 + self.speech_token_num,
        #                                                                speech_embeds, speech_masks, speech_target)

        qwen_instruct_prompt_pattern_1 = "<|im_start|>system\nYou are OSUM-chat, a speech synthesis assistant by ASLP Lab. You generate natural and fluent speech from text input.<|im_end|>\n<|im_start|>user\n"
        prompt_pattern1 = self.tokenizer([qwen_instruct_prompt_pattern_1] * len(speech_embeds), return_tensors="pt"
                                         )['input_ids'].to(speech_embeds.device)
        prompt_pattern1_embeds = self.embed_tokens(prompt_pattern1)

        qwen_instruct_prompt_pattern_2 = "<|im_end|>\n<|im_start|>assistant\n"
        prompt_pattern2 = self.tokenizer([qwen_instruct_prompt_pattern_2] * len(speech_embeds), return_tensors="pt"
                                         )['input_ids'].to(speech_embeds.device)
        prompt_pattern2_embeds = self.embed_tokens(prompt_pattern2)
        hyps = [self.speech_token_num - 1]
        speech_begin_token_emb = self.speech_token_emded(torch.tensor(hyps[-1:]).to(device)).unsqueeze(0)
        embeds = torch.cat([prompt_pattern1_embeds,
                            speech_embeds,
                            prompt_pattern2_embeds,
                            speech_begin_token_emb], dim=1).to(torch.bfloat16)
        # 指定top_k top_p temperature stop
        # max_len = 250
        top_k = 15  # 5
        top_p = 0.8  # 0.9
        temperature = 1.2  # 1

        print(f"tts eos id = {self.speech_token_num - 1}")
        llm_out = self.llama_model.generate(inputs_embeds=embeds,
                                            max_new_tokens=self.max_length,
                                            eos_token_id=self.speech_token_num - 1,
                                            cache_implementation="static",
                                            do_sample=True,
                                            temperature=temperature,
                                            top_k=top_k,
                                            top_p=top_p,
                                            stopping_criteria=StoppingCriteriaList([MaxTokenStopper(2000)]),
                                            do_compile=True,
                                            repetition_penalty=1.0,
                                            )

        return llm_out

    def generate_tts_streaming(
            self,
            device,
            prompt,
            text,
    ):
        self.llama_model.eval()
        # labels_lengths = torch.tensor([len(text[0])], dtype=torch.int64, device=device)
        # labels = text[:,:]
        labels = self.tokenizer(
            [text],
            return_tensors="pt",
            add_special_tokens=False
        ).to(
            self.embed_tokens.weight.device).input_ids  # (1, L)
        labels = labels.to(device)
        labels_lengths = torch.tensor([len(labels[0])], dtype=torch.int64, device=device)
        print(f'label_lengths:{labels_lengths}')
        print(f'labels:{labels}')
        labels_pad_mask = make_pad_mask(labels_lengths)  # B, L
        labels = labels.masked_fill(labels_pad_mask, 0)
        speech_embeds = self.embed_tokens(labels)  # B, L, D
        speech_target = torch.full(labels_pad_mask.shape, self.IGNORE_ID).to(
            speech_embeds.device)
        speech_masks = ~labels_pad_mask
        speech_embeds, speech_masks, speech_target = self._add_bos_eos(0 + self.speech_token_num,
                                                                       1 + self.speech_token_num,
                                                                       speech_embeds, speech_masks, speech_target)

        qwen_instruct_prompt_pattern_1 = "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n"
        prompt_pattern1 = self.tokenizer([qwen_instruct_prompt_pattern_1] * len(speech_embeds), return_tensors="pt"
                                         )['input_ids'].to(speech_embeds.device)
        prompt_pattern1_embeds = self.embed_tokens(prompt_pattern1)

        qwen_instruct_prompt_pattern_2 = "<|im_end|>\n<|im_start|>assistant\n"
        prompt_pattern2 = self.tokenizer([qwen_instruct_prompt_pattern_2] * len(speech_embeds), return_tensors="pt"
                                         )['input_ids'].to(speech_embeds.device)
        prompt_pattern2_embeds = self.embed_tokens(prompt_pattern2)

        prompt = self.tokenizer([prompt], return_tensors="pt"
                                )['input_ids'].to(speech_embeds.device)
        prompt_embeds = self.embed_tokens(prompt)
        # embeds = torch.cat([prompt_pattern1_embeds, prompt_embeds, speech_embeds, prompt_pattern2_embeds], dim=1)
        # ----------------
        start_id = prompt_pattern2[0][-1]
        new_prompt_pattern2 = prompt_pattern2[:, :-1]
        prompt_pattern2_embeds = self.embed_tokens(new_prompt_pattern2)
        prompt_pattern2_lens = torch.tensor([len(i) for i in new_prompt_pattern2], dtype=torch.long).to(device)
        prompt_pattern2_mask = ~make_pad_mask(prompt_pattern2_lens)
        prompt_pattern2_target = torch.full(new_prompt_pattern2.shape, self.IGNORE_ID).to(
            device)  # B, L

        embeds = torch.cat([prompt_pattern1_embeds, prompt_embeds, speech_embeds, prompt_pattern2_embeds], dim=1)
        atts = torch.ones(embeds.size()[:-1], dtype=torch.long).to(embeds.device)
        if self.embed_tokens.weight.dtype == torch.float16:
            # utils_file.logging_limit_print('generate(): self.embed_tokens.weight.dtype == torch.float16')
            embeds = embeds.to(torch.float16)
        max_len = 350
        hyps = [start_id]
        print(f'start_id: {start_id}')
        llm_out = self.llama_model(
            inputs_embeds=embeds,
            past_key_values=None,
            output_hidden_states=True
        )

        batch_size = 1
        top_k = 10
        top_p = 0.9
        temperature = 1
        cache = llm_out.past_key_values
        token_emb = self.embed_tokens(torch.tensor(hyps[-1:]).to(device)).unsqueeze(0)
        top_ps_tensor = torch.FloatTensor([top_p] * batch_size).to(device)
        top_ks_tensor = torch.LongTensor([top_k] * batch_size).to(device)
        temperatures_tensor = None if not temperature else torch.FloatTensor(
            [temperature] * batch_size).to(device)
        inferring_txt = True
        txt_finished = False
        speeech_finished = False
        hyps_text = ""
        speech_eos_num = 0
        txt_list = []
        token_list = []
        i_num = 0
        for i in range(max_len):
            if inferring_txt and not txt_finished:
                for i_txt in range(6):
                    i_num += 1
                    if i_num > 300:
                        break
                    llm_out = self.llama_model(
                        inputs_embeds=token_emb,
                        past_key_values=cache,
                        output_hidden_states=True
                    )
                    cache = llm_out.past_key_values
                    hidden_states = llm_out.hidden_states[-1]
                    token_logits = self.lm_head(hidden_states)
                    next_token_ids = self._sampler(
                        token_logits,
                        temperatures_tensor,
                        top_ps_tensor,
                        top_ks_tensor,
                    )
                    # next_token_ids = torch.argmax(token_logits, dim=-1)
                    print(i_num, next_token_ids, f'txt')
                    hyps.append(next_token_ids.item())
                    token_emb = self.embed_tokens(torch.tensor(hyps[-1:]).to(device)).unsqueeze(0)
                    if next_token_ids == self.eos_token_id:
                        txt_finished = True
                        hyps_text = self.tokenizer.decode(txt_list, skip_special_tokens=True, add_special_tokens=False)
                        print("hyps_text:", hyps_text)
                        print("text is over")
                        break
                    txt_list.append(next_token_ids.item())
                hyps_text = self.tokenizer.decode(txt_list, skip_special_tokens=True, add_special_tokens=False)
                print("hyps_text:", hyps_text)
                inferring_txt = False
            elif not speeech_finished:
                for i_speech in range(18):
                    i_num += 1
                    if i_num > 300:
                        break
                    llm_out = self.llama_model(
                        inputs_embeds=token_emb,
                        past_key_values=cache,
                        output_hidden_states=True
                    )
                    cache = llm_out.past_key_values
                    hidden_states = llm_out.hidden_states[-1]
                    token_logits = self.speech_head(hidden_states)
                    next_token_ids = self._sampler(
                        token_logits,
                        temperatures_tensor,
                        top_ps_tensor,
                        top_ks_tensor,
                    )
                    # next_token_ids = torch.argmax(token_logits, dim=-1)
                    hyps.append(next_token_ids.item())
                    print(i_num, next_token_ids, f'speech')
                    token_emb = self.speech_token_emded(torch.tensor(hyps[-1:]).to(device)).unsqueeze(0)
                    if next_token_ids == 4096:
                        speech_eos_num += 1
                        print(f'遇到 4096')
                        if speech_eos_num >= 2:
                            print("speech is over")
                            speeech_finished = True
                        break
                    token_list.append(next_token_ids.item())
                    inferring_txt = True
            if i_num > 300:
                break
        return [hyps_text + "|" + str(token_list)]

    def generate_text2text(
            self,
            device,
            text,
    ):
        self.llama_model.eval()
        # labels_lengths = torch.tensor([len(text[0])], dtype=torch.int64, device=device)
        # labels = text[:,:]
        labels = self.tokenizer(
            [text],
            return_tensors="pt",
            add_special_tokens=False
        ).to(
            self.embed_tokens.weight.device).input_ids  # (1, L)
        labels = labels.to(device)
        labels_lengths = torch.tensor([len(labels[0])], dtype=torch.int64, device=device)
        # print(f'label_lengths:{labels_lengths}')
        # print(f'labels:{labels}')
        labels_pad_mask = make_pad_mask(labels_lengths)  # B, L
        labels = labels.masked_fill(labels_pad_mask, 0)
        speech_embeds = self.embed_tokens(labels)  # B, L, D
        speech_target = torch.full(labels_pad_mask.shape, self.IGNORE_ID).to(
            speech_embeds.device)
        speech_masks = ~labels_pad_mask
        # speech_embeds, speech_masks, speech_target = self._add_bos_eos(0 + self.speech_token_num,
        #                                                                1 + self.speech_token_num,
        #                                                                speech_embeds, speech_masks, speech_target)

        # qwen_instruct_prompt_pattern_1 = "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n"
        # # sft
        qwen_instruct_prompt_pattern_1 = "<|im_start|>system\nYou are OSUM-chat, a text-to-text dialogue assistant by ASLP Lab. You understand user input in text then respond exclusively with appropriate text.<|im_end|>\n<|im_start|>user\n"
        prompt_pattern1 = self.tokenizer([qwen_instruct_prompt_pattern_1] * len(speech_embeds), return_tensors="pt"
                                         )['input_ids'].to(speech_embeds.device)
        prompt_pattern1_embeds = self.embed_tokens(prompt_pattern1)

        qwen_instruct_prompt_pattern_2 = "<|im_end|>\n<|im_start|>assistant\n"
        prompt_pattern2 = self.tokenizer([qwen_instruct_prompt_pattern_2] * len(speech_embeds), return_tensors="pt"
                                         )['input_ids'].to(speech_embeds.device)
        prompt_pattern2_embeds = self.embed_tokens(prompt_pattern2)
        embeds = torch.cat([prompt_pattern1_embeds, speech_embeds, prompt_pattern2_embeds], dim=1)
        atts = torch.ones(embeds.size()[:-1], dtype=torch.long).to(embeds.device)

        if self.embed_tokens.weight.dtype == torch.float16 or self.embed_tokens.weight.dtype == torch.bfloat16:
            # utils_file.logging_limit_print('generate(): self.embed_tokens.weight.dtype == torch.float16')
            # embeds = embeds.to(torch.float16)
            embeds = embeds.to(torch.bfloat16)
            atts = atts.to(torch.bfloat16)
        outputs = self.llama_model.generate(
            inputs_embeds=embeds,
            max_new_tokens=200,
            num_beams=1,
            do_sample=False,
            min_length=self.min_length,
            repetition_penalty=1.0,
            length_penalty=1.0,
            temperature=self.temperature,
            attention_mask=atts,
            eos_token_id=151645,
            pad_token_id=-100,
            do_compile=True,
            cache_implementation="static",
        )
        output_text = self.tokenizer.batch_decode(outputs, add_special_tokens=False, skip_special_tokens=True)
        # output_text = [item.replace('<|endoftext|>', '') for item in output_text]
        return output_text

    def generate_s2s_no_stream_with_repetition_penalty(
            self,
            wavs,
            wavs_len,
    ):
        self.llama_model.eval()
        self.set_task_type("S2S")
        self.do_add_speech_embed_head()
        # =====================准备input embedding=====================
        speech_embeds, speech_masks = self._get_embedding_from_wav(wavs, wavs_len)
        speech_embeds, speech_masks, _ = self._add_bos_eos(0 + self.speech_token_num, None,
                                                           speech_embeds, speech_masks, None)

        device = speech_embeds.device

        # qwen_instruct_prompt_pattern_1 = "<|im_start|>system\nYou are OSUM-chat, a dialogue assistant by ASLP Lab. You understand both the meaning and paralinguistic cues in speech, as well as input text, and respond appropriately.<|im_end|>\n<|im_start|>user\n"
        qwen_instruct_prompt_pattern_1 = "<|im_start|>system\nYou are OSUM-chat, a speech-to-speech dialogue assistant by ASLP Lab. You understand both the meaning and paralinguistic cues in speech then respond with appropriate text and emotionally matching synthetic speech.<|im_end|>\n<|im_start|>user\n"
        # qwen_instruct_prompt_pattern_1 = "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n"
        prompt_pattern1 = self.tokenizer([qwen_instruct_prompt_pattern_1] * len(wavs_len), return_tensors="pt"
                                         )['input_ids'].to(speech_embeds.device)
        prompt_pattern1_embeds = self.embed_tokens(prompt_pattern1)

        qwen_instruct_prompt_pattern_2 = "<|im_end|>\n<|im_start|>assistant\n"
        prompt_pattern2 = self.tokenizer([qwen_instruct_prompt_pattern_2] * len(wavs_len), return_tensors="pt"
                                         )['input_ids'].to(speech_embeds.device)
        prompt_pattern2_embeds = self.embed_tokens(prompt_pattern2)


        hyps = [4098]
        token_emb = self.speech_token_emded(torch.tensor(hyps[-1:]).to(device)).unsqueeze(0)

        embeds = torch.cat(
            [prompt_pattern1_embeds, speech_embeds, token_emb, prompt_pattern2_embeds],
            dim=1)
        atts = torch.ones(embeds.size()[:-1], dtype=torch.long).to(embeds.device)
        if self.embed_tokens.weight.dtype == torch.float16:
            # utils_file.logging_limit_print('generate(): self.embed_tokens.weight.dtype == torch.float16')
            embeds = embeds.to(torch.float16)

        # max_len = 350
        top_k = 10
        top_p = 0.9
        temperature = 1.2
        invalid_eos = 10000000
        self.osum_chat_logit_processor1.init_match_found()
        llm_out = self.llama_model.generate(inputs_embeds=embeds,
                                            max_new_tokens=self.max_length,
                                            eos_token_id=invalid_eos,
                                            cache_implementation="static",
                                            do_sample=True,
                                            temperature=temperature,
                                            top_k=top_k,
                                            top_p=top_p,
                                            logits_processor=self.s2s_repetition_penalty,
                                            stopping_criteria=self.s2s_stop_criteria,
                                            do_compile=True,
                                            repetition_penalty=1.0,
                                            )

        text_eos_idx = (llm_out[0] == 151645).nonzero(as_tuple=True)[0][0].item()
        text_res = llm_out[:, :text_eos_idx - 1]
        speech_res = llm_out[:, text_eos_idx + 1:-1]

        # print("llm_out", llm_out)
        output_text = self.tokenizer.batch_decode(text_res, add_special_tokens=False, skip_special_tokens=True)
        # print(f'output_text:{output_text}')
        # print(f'speech_res:{speech_res}')
        return (output_text, text_res, speech_res)


    def generate_s2s_no_stream_think_with_repetition_penalty(
            self,
            wavs,
            wavs_len,
    ):
        self.llama_model.eval()
        self.set_task_type("S2S")
        self.do_add_speech_embed_head()
        # =====================准备input embedding=====================
        speech_embeds, speech_masks = self._get_embedding_from_wav(wavs, wavs_len)
        speech_embeds, speech_masks, _ = self._add_bos_eos(0 + self.speech_token_num, None,
                                                           speech_embeds, speech_masks, None)

        device = speech_embeds.device

        # qwen_instruct_prompt_pattern_1 = "<|im_start|>system\nYou are OSUM-chat, a dialogue assistant by ASLP Lab. You understand both the meaning and paralinguistic cues in speech, as well as input text, and respond appropriately.<|im_end|>\n<|im_start|>user\n"
        # qwen_instruct_prompt_pattern_1 = "<|im_start|>system\nYou are OSUM-chat, a speech-to-speech dialogue assistant by ASLP Lab. You understand both the meaning and paralinguistic cues in speech then respond with appropriate text and emotionally matching synthetic speech.<|im_end|>\n<|im_start|>user\n"
        qwen_instruct_prompt_pattern_1 = "<|im_start|>system\nYou are OSUM-chat, a speech-to-speech dialogue assistant by ASLP Lab. You understand both the meaning and paralinguistic cues in speech. Before responding, first output your reasoning inside <think>...</think end>, analyzing the user’s words and vocal cues. Then generate a reply with appropriate text and emotionally matched synthetic speech.<|im_end|>\n<|im_start|>user\n"
        # qwen_instruct_prompt_pattern_1 = "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n"
        prompt_pattern1 = self.tokenizer([qwen_instruct_prompt_pattern_1] * len(wavs_len), return_tensors="pt"
                                         )['input_ids'].to(speech_embeds.device)
        prompt_pattern1_embeds = self.embed_tokens(prompt_pattern1)

        qwen_instruct_prompt_pattern_2 = "<|im_end|>\n<|im_start|>assistant\n"
        prompt_pattern2 = self.tokenizer([qwen_instruct_prompt_pattern_2] * len(wavs_len), return_tensors="pt"
                                         )['input_ids'].to(speech_embeds.device)
        prompt_pattern2_embeds = self.embed_tokens(prompt_pattern2)


        hyps = [4098]
        token_emb = self.speech_token_emded(torch.tensor(hyps[-1:]).to(device)).unsqueeze(0)

        embeds = torch.cat(
            [prompt_pattern1_embeds, speech_embeds, token_emb, prompt_pattern2_embeds],
            dim=1)
        atts = torch.ones(embeds.size()[:-1], dtype=torch.long).to(embeds.device)
        if self.embed_tokens.weight.dtype == torch.float16:
            # utils_file.logging_limit_print('generate(): self.embed_tokens.weight.dtype == torch.float16')
            embeds = embeds.to(torch.float16)

        # max_len = 350
        top_k = 10
        top_p = 0.9
        temperature = 1.2
        invalid_eos = 10000000
        self.osum_chat_logit_processor1.init_match_found() # 非think不用匹配
        llm_out = self.llama_model.generate(inputs_embeds=embeds,
                                            max_new_tokens=self.max_length,
                                            eos_token_id=invalid_eos,
                                            cache_implementation="static",
                                            do_sample=True,
                                            temperature=temperature,
                                            top_k=top_k,
                                            top_p=top_p,
                                            logits_processor=self.s2s_repetition_penalty,
                                            stopping_criteria=self.s2s_stop_criteria,
                                            do_compile=True,
                                            repetition_penalty=1.0,
                                            )

        text_eos_idx = (llm_out[0] == 151645).nonzero(as_tuple=True)[0][0].item()
        text_res = llm_out[:, :text_eos_idx - 1]
        speech_res = llm_out[:, text_eos_idx + 1:-1]

        # print("llm_out", llm_out)
        output_text = self.tokenizer.batch_decode(text_res, add_special_tokens=False, skip_special_tokens=True)
        # print(f'output_text:{output_text}')
        # print(f'speech_res:{speech_res}')
        return (output_text, text_res, speech_res)


    def _get_embedding_from_wav(self, wavs, wavs_len):
        """
        return:
        wav_embedding: (b, l, v)
        wav_mask:  (b, l), wav为有效值的位置为true
        """
        encoder_out, encoder_mask = self.encoder(wavs, wavs_len)

        speech_embeds, encoder_mask = self.down_sample_2(encoder_out, encoder_mask)
        if self.speech_transformer is not None:
            filled_wavs_len = encoder_mask.squeeze(1).sum(-1)
            speech_embeds, encoder_mask = self.speech_transformer(speech_embeds, filled_wavs_len)
            # if rank == 0:
            #     utils_file.logging_limit_print(
            #         f'out of link shape: {speech_embeds.shape},encoder的第一帧的前20个数字：\n {speech_embeds[0][0][:20]}')

            # utils_file.logging_limit_print(
            #     'get_embedding_from_wav(): speech_embeds.shape,by  self.speech_transformer(speech_embeds, speech_lens):',
            #     speech_embeds.shape)
            speech_embeds = self.speech_llama_proj(speech_embeds)
            # if rank == 0:
            #     utils_file.logging_limit_print(
            #         f'out of speech_llama_proj shape: {speech_embeds.shape},encoder的第一帧的前20个数字：\n {speech_embeds[0][0][:20]}')

        # utils_file.logging_limit_print(
        #     'get_embedding_from_wav(): speech_embeds.shape,by  self.speech_llama_proj(speech_embeds):',
        #     speech_embeds.shape)

        return speech_embeds, encoder_mask.squeeze(1)

    def _get_embedding_from_text(self, text):
        """
        将字符串先量化，再转成词向量

        Args:
            text: str

        Returns:
            text_embeds: (1, L, D)

        """
        text_id = self.tokenizer(
            text,
            return_tensors="pt",
            add_special_tokens=False
        ).to(
            self.embed_tokens.weight.device).input_ids
        text_embeds = self.embed_tokens(text_id)
        text_embeds_len = torch.tensor([text_embeds.size(1)], dtype=torch.long)
        return text_embeds, text_embeds_len

    def _add_bos_eos(self, bos, eos, inputs_embeds, attention_mask, target=None):
        B = len(inputs_embeds)
        bos_eos_target = torch.full([B, 1], self.IGNORE_ID).to(inputs_embeds.device)  # B,1
        bos_eos_mask = torch.full([B, 1], True).to(inputs_embeds.device)  # B, 1

        if bos is not None:
            bos_embed = self.speech_token_emded(torch.full([B, 1],
                                                           bos).to(inputs_embeds.device))  # B, 1, D
            inputs_embeds = torch.cat((bos_embed, inputs_embeds), 1)  # B, (1+T), D
            attention_mask = torch.cat((bos_eos_mask, attention_mask), 1)  # B, (1+T)
            if target is not None:
                target = torch.cat((bos_eos_target, target), 1)  # B, (1+T), D

        if eos is not None:
            eos_embed = self.speech_token_emded(torch.full([B, 1],
                                                           eos).to(inputs_embeds.device))  # B, 1, D
            inputs_embeds = torch.cat((inputs_embeds, eos_embed), 1)  # B, (1+T+1), D
            attention_mask = torch.cat((attention_mask, bos_eos_mask), 1)  # B, (1+T+1)
            if target is not None:
                target = torch.cat((target, bos_eos_target), 1)  # B, (1+T+1), D

        return inputs_embeds, attention_mask, target


    def infer_sample_teach_force(
            self,
            wavs,
            wavs_len,
            prompt,
            text,
            speech_token,
    ):
        labels_lengths = torch.tensor([len(text[0])], dtype=torch.int64, device=wavs.device)
        labels = text[:, :]
        labels_pad_mask = make_pad_mask(labels_lengths)  # B, L
        labels = labels.masked_fill(labels_pad_mask, 0)
        speech_embeds = self.embed_tokens(labels)  # B, L, D
        speech_target = torch.full(labels_pad_mask.shape, self.IGNORE_ID).to(
            speech_embeds.device)
        speech_masks = ~labels_pad_mask
        speech_embeds, speech_masks, speech_target = self._add_bos_eos(0 +self.speech_token_num,
                                                                       1 +self.speech_token_num,
                                                                       speech_embeds, speech_masks, speech_target)

        prompt = self.tokenizer([prompt], return_tensors="pt"
                                )['input_ids'].to(speech_embeds.device)
        prompt_embeds = self.embed_tokens(prompt)
        embeds = torch.cat([prompt_embeds, speech_embeds], dim=1)
        atts = torch.ones(embeds.size()[:-1], dtype=torch.long).to(embeds.device)
        if self.embed_tokens.weight.dtype == torch.float16:
            utils_file.logging_limit_print('generate(): self.embed_tokens.weight.dtype == torch.float16')
            embeds = embeds.to(torch.float16)
            atts = atts.half()
        device = wavs.device
        inputs_embeds = embeds.to(device)

        speech_token_list = speech_token[0].tolist()
        speech_token_list_len = len(speech_token_list)
        print(f'speech_token_list_len:{speech_token_list_len}')
        max_len = 200
        beam = 3
        beam_size = beam
        running_size = beam
        output_token = []
        hyps = [self.speech_token_num - 1]

        scores = [1.0]
        llm_out = self.llama_model(
            inputs_embeds=embeds,
            past_key_values=None,
            output_hidden_states=True
        )
        batch_size = 1
        top_k = 10
        top_p = 0.9
        temperature = 1.0
        cache = llm_out.past_key_values
        token_emb = self.speech_token_emded(torch.tensor(hyps[-1:]).to(device)).unsqueeze(0)
        repetition_penalty = 1.1
        top_ps_tensor = torch.FloatTensor([top_p] * batch_size).to(device)
        top_ks_tensor = torch.LongTensor([top_k] * batch_size).to(device)
        temperatures_tensor = None if not temperature else torch.FloatTensor(
            [temperature] * batch_size).to(device)

        for i in range(max_len):
            llm_out = self.llama_model(
                inputs_embeds=token_emb,
                past_key_values=cache,
                output_hidden_states=True
            )
            cache = llm_out.past_key_values
            hidden_states = llm_out.hidden_states[-1]
            token_logits = self.speech_head(hidden_states)
            # probs =  F.log_softmax(token_logits[:,-1], dim=-1)[0]
            next_token_ids = self._sampler(
                token_logits,
                temperatures_tensor,
                top_ps_tensor,
                top_ks_tensor,
            )
            print(i, next_token_ids)
            if next_token_ids == self.speech_token_num - 1:
                print("break la!")
                print("hyps:", hyps)
                break
            hyps.append(next_token_ids.item())
            token_emb = self.speech_token_emded(torch.tensor(speech_token_list[i]).to(device)).unsqueeze(0)
        res = []
        for i in hyps[1:]:
            # if i != self.speech_token_num-1:
            res.append(i)
        print(res)
        return [res]

    def _sampler(
            self,
            logits: torch.Tensor,
            temperatures: Union[torch.Tensor, None],
            top_ps: torch.Tensor,
            top_ks: torch.Tensor,
    ) -> torch.Tensor:
        """
        Sample from logits.
        Args:
            logits: （1，1，vocab_size）
            temperatures:
            top_ps:
            top_ks:

        Returns:

        """
        print(f'logits:{logits.shape}')
        assert logits.size(1) == 1
        logits = logits.squeeze(1)  # (batch_size, vocab_size)
        if temperatures is None:
            return torch.argmax(logits, dim=-1).squeeze(dim=-1)
        # Apply temperature scaling.
        logits.div_(temperatures.unsqueeze(dim=1))
        # Calculate probabilities with softmax.
        probs = torch.softmax(logits, dim=-1, dtype=torch.float)
        probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
        # Apply top-p, top-k.
        probs_sum = torch.cumsum(probs_sort, dim=-1)
        top_ps_mask = (probs_sum - probs_sort) > top_ps.unsqueeze(dim=1)
        probs_sort = torch.where(top_ps_mask, 0, probs_sort)

        top_ks_mask = torch.arange(probs_idx.shape[-1], device=probs_idx.device)
        top_ks_mask = top_ks_mask.expand(probs_idx.shape[0], -1)
        top_ks_mask = top_ks_mask >= top_ks.unsqueeze(dim=1)
        probs_sort = torch.where(top_ks_mask, 0, probs_sort)
        # Re-normalization.
        probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
        probs = torch.gather(probs_sort,
                             dim=-1,
                             index=torch.argsort(probs_idx, dim=-1))
        next_token_ids = torch.multinomial(probs, num_samples=1,
                                           replacement=True).squeeze(dim=-1)
        return next_token_ids

    def infer_sample4speech2text_token_teacher_force(
            self,
            wavs,
            wavs_len,
            prompt,
            speech_token=None,
            answer_text=None,
    ):
        self.llama_model.eval()
        speech_embeds, speech_masks = self._get_embedding_from_wav(wavs, wavs_len)
        speech_embeds, speech_masks, _ = self._add_bos_eos(0 +self.speech_token_num, 1+self.speech_token_num,
                                                           speech_embeds, speech_masks, None)

        device = speech_embeds.device

        prompt = self.tokenizer([prompt], return_tensors="pt"
                                )['input_ids'].to(speech_embeds.device)
        prompt_embeds = self.embed_tokens(prompt)
        text_token = self.tokenizer([answer_text], return_tensors="pt"
                                    )['input_ids'].to(speech_embeds.device)
        text_token_embeds = self.embed_tokens(text_token)
        embeds = torch.cat([prompt_embeds, speech_embeds, text_token_embeds], dim=1)
        atts = torch.ones(embeds.size()[:-1], dtype=torch.long).to(embeds.device)
        if self.embed_tokens.weight.dtype == torch.float16:
            utils_file.logging_limit_print('generate(): self.embed_tokens.weight.dtype == torch.float16')
            embeds = embeds.to(torch.float16)
            atts = atts.half()
        inputs_embeds = embeds.to(speech_embeds.device)

        max_len = 150
        beam = 3
        beam_size = beam
        running_size = beam
        output_token = []
        hyps = [self.speech_token_num]
        hyps_text = ""

        scores = [1.0]
        llm_out = self.llama_model(
            inputs_embeds=embeds,
            past_key_values=None,
            output_hidden_states=True
        )
        # speech_token_list = speech_token[0]
        # speech_token_list_len = len(speech_token_list)
        if speech_token is not None:
            print(f'speech_token_list_len:{len(speech_token[0])}')
            print(f'speech_token:{speech_token[0]}')

        batch_size = 1
        top_k = 10
        top_p = 0.9
        temperature = 1.2
        cache = llm_out.past_key_values
        token_emb = self.speech_token_emded(torch.tensor(hyps[-1:]).to(device)).unsqueeze(0)
        repetition_penalty = 1.1
        top_ps_tensor = torch.FloatTensor([top_p] * batch_size).to(device)
        top_ks_tensor = torch.LongTensor([top_k] * batch_size).to(device)
        temperatures_tensor = None if not temperature else torch.FloatTensor(
            [temperature] * batch_size).to(device)
        is_speech_token = False
        speech_eos_num = 0
        for i in range(max_len):
            llm_out = self.llama_model(
                inputs_embeds=token_emb,
                past_key_values=cache,
                output_hidden_states=True
            )
            cache = llm_out.past_key_values
            hidden_states = llm_out.hidden_states[-1]
            token_logits = self.speech_head(hidden_states)
            # probs =  F.log_softmax(token_logits[:,-1], dim=-1)[0]
            # if i ==2 or i == 80:
            #     torch.save(probs, f'probs_{i}.pt')
            next_token_ids = self._sampler(
                token_logits,
                temperatures_tensor,
                top_ps_tensor,
                top_ks_tensor,
            )
            print(i, next_token_ids, f'is_speech_token:{is_speech_token}')
            if next_token_ids == self.speech_token_num - 1:
                print(f'遇到 4096')
                break
            hyps.append(next_token_ids.item())
            # if 1+i >= len(speech_token[0]):
            #     break
            # token_emb = self.speech_token_emded(torch.tensor([speech_token[0][i+1]]).to(device)).unsqueeze(0)
            token_emb = self.embed_tokens(torch.tensor(hyps[-1:]).to(device)).unsqueeze(0)
        res = []
        for i in hyps[1:]:
            # if i != self.speech_token_num-1:
            res.append(i)
        print(res)
        return [answer_text + str(res[2:])]

    def infer_sample4speech2text_token_teacher_force2(
            self,
            wavs,
            wavs_len,
            prompt,
            speech_token=None,
            answer_text=None,
    ):
        self.llama_model.eval()
        speech_embeds, speech_masks = self._get_embedding_from_wav(wavs, wavs_len)
        speech_embeds, speech_masks, _ = self._add_bos_eos(0 +self.speech_token_num, 1+self.speech_token_num ,
                                                           speech_embeds, speech_masks, None)

        device = speech_embeds.device

        prompt = self.tokenizer([prompt], return_tensors="pt"
                                )['input_ids'].to(speech_embeds.device)
        prompt_embeds = self.embed_tokens(prompt)
        text_token = self.tokenizer([answer_text], return_tensors="pt"
                                    )['input_ids'].to(speech_embeds.device)
        # text_token_embeds = self.embed_tokens(text_token)
        embeds = torch.cat([prompt_embeds, speech_embeds], dim=1)
        atts = torch.ones(embeds.size()[:-1], dtype=torch.long).to(embeds.device)
        if self.embed_tokens.weight.dtype == torch.float16:
            utils_file.logging_limit_print('generate(): self.embed_tokens.weight.dtype == torch.float16')
            embeds = embeds.to(torch.float16)
            atts = atts.half()
        inputs_embeds = embeds.to(speech_embeds.device)

        max_len = 150
        beam = 3
        beam_size = beam
        running_size = beam
        output_token = []
        hyps = [self.speech_token_num - 1]
        hyps_text = ""

        scores = [1.0]
        llm_out = self.llama_model(
            inputs_embeds=embeds,
            past_key_values=None,
            output_hidden_states=True
        )
        # speech_token_list = speech_token[0]
        # speech_token_list_len = len(speech_token_list)
        if speech_token is not None:
            print(f'speech_token_list_len:{len(speech_token)}')
            print(f'speech_token:{speech_token}')

        batch_size = 1
        top_k = 10
        top_p = 0.9
        temperature = 1.2
        cache = llm_out.past_key_values
        token_emb = self.speech_token_emded(torch.tensor(hyps[-1:]).to(device)).unsqueeze(0)
        repetition_penalty = 1.1
        top_ps_tensor = torch.FloatTensor([top_p] * batch_size).to(device)
        top_ks_tensor = torch.LongTensor([top_k] * batch_size).to(device)
        temperatures_tensor = None if not temperature else torch.FloatTensor(
            [temperature] * batch_size).to(device)
        is_speech_token = False
        speech_eos_num = 0
        token_num = len(speech_token)
        for i in range(token_num):
            llm_out = self.llama_model(
                inputs_embeds=token_emb,
                past_key_values=cache,
                output_hidden_states=True
            )
            cache = llm_out.past_key_values
            hidden_states = llm_out.hidden_states[-1]
            token_logits = self.speech_head(hidden_states)
            # probs =  F.log_softmax(token_logits[:,-1], dim=-1)[0]
            # if i ==2 or i == 80:
            #     torch.save(probs, f'probs_{i}.pt')
            next_token_ids = self._sampler(
                token_logits,
                temperatures_tensor,
                top_ps_tensor,
                top_ks_tensor,
            )
            print(i, next_token_ids, f'is_speech_token:{is_speech_token}')
            if next_token_ids == self.speech_token_num - 1:
                print(f'遇到 4096')
                break
            hyps.append(next_token_ids.item())
            # if 1+i >= len(speech_token[0]):
            #     break
            # token_emb = self.speech_token_emded(torch.tensor([speech_token[0][i+1]]).to(device)).unsqueeze(0)
            token_emb = self.embed_tokens(torch.tensor([speech_token[i]]).to(device)).unsqueeze(0)
        res = []
        for i in hyps[1:]:
            # if i != self.speech_token_num-1:
            res.append(i)
        print(res)
        return [hyps_text + str(res)]