# SPDX-License-Identifier: Apache-2.0
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
# Copyright 2024 The Qwen team.
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only Qwen2 model compatible with HuggingFace weights.""" | |
from vllm.model_executor.models.qwen2 import * | |


class CosyVoice2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
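    # vLLM fuses the attention q/k/v projections into a single qkv_proj and
    # the MLP gate/up projections into a single gate_up_proj. This mapping
    # tells the LoRA machinery (and tooling built on it) which per-projection
    # checkpoint modules feed each fused module.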
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    }

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        lora_config = vllm_config.lora_config

        self.config = config
        self.lora_config = lora_config
        self.quant_config = quant_config
        self.model = Qwen2Model(vllm_config=vllm_config,
                                prefix=maybe_prefix(prefix, "model"))

        if get_pp_group().is_last_rank:
            if config.tie_word_embeddings:
                self.lm_head = self.model.embed_tokens
            else:
                # CosyVoice2's lm_head carries a bias (stock Qwen2's does
                # not), so it is requested explicitly here.
                self.lm_head = ParallelLMHead(config.vocab_size,
                                              config.hidden_size,
                                              bias=True,
                                              quant_config=quant_config,
                                              prefix=maybe_prefix(
                                                  prefix, "lm_head"))
        else:
            # Non-last pipeline ranks never compute logits.
            self.lm_head = PPMissingLayer()

        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.model.get_input_embeddings(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        # Under pipeline parallelism, non-last ranks return
        # IntermediateTensors to be forwarded to the next rank; the last
        # rank returns the final hidden states.
        hidden_states = self.model(input_ids, positions, intermediate_tensors,
                                   inputs_embeds)
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        # Forward the lm_head bias to the logits processor. getattr guards
        # the tied-embeddings case, where lm_head is the embedding layer and
        # carries no bias attribute.
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata,
                                       getattr(self.lm_head, "bias", None))
        return logits

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(
            self,
            # With tied embeddings there is no separate lm_head weight in the
            # checkpoint, so any lm_head.* entries are skipped.
            skip_prefixes=(["lm_head."]
                           if self.config.tie_word_embeddings else None),
        )
        return loader.load_weights(weights)
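
# Hedged usage sketch (an assumption, not part of the original file): vLLM
# resolves model classes by the architecture name in the checkpoint's
# config.json, and an out-of-tree class like this one can be registered with
# vllm.ModelRegistry before the engine is built. The architecture string
# "CosyVoice2ForCausalLM" below is illustrative.
#
#     from vllm import LLM, ModelRegistry
#
#     ModelRegistry.register_model("CosyVoice2ForCausalLM",
#                                  CosyVoice2ForCausalLM)
#     llm = LLM(model="/path/to/cosyvoice2_llm")  # hypothetical local path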