GGUF
I'm trying to convert the model to GGUF, but the Transformers architecture does not match what the converter expects.
Hello, we have now updated the Hugging Face-compatible model on the main branch. You can load it directly with the following code:
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
model_path = "ZJU-AI4H/Hulu-Med-14B"
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",
)
processor = AutoProcessor.from_pretrained(
    model_path,
    trust_remote_code=True
)
tokenizer = processor.tokenizer
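For a quick text-only sanity check, a minimal generation sketch could look like the following (it assumes the bundled tokenizer exposes a standard chat template; the prompt is only an illustration, and image/video inputs need the processor instead):

# Minimal text-only sketch; prompt and generation settings are illustrative only.
messages = [{"role": "user", "content": "What are the common symptoms of iron-deficiency anemia?"}]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

with torch.no_grad():
    output_ids = model.generate(input_ids, max_new_tokens=256)

print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))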
Thank you very much.
I had to modify this section in convert_hf_to_gguf.py:
@ModelBase.register("Qwen3ForCausalLM", "HulumedQwen3ForCausalLM")
class Qwen3Model(Qwen2Model):
    model_arch = gguf.MODEL_ARCH.QWEN3

    # extra logic for rerank models
    is_rerank: bool = False
    is_tied_embeddings: bool = False
    token_false_id: int | None = None
    token_true_id: int | None = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # track for intern-s1-mini
        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
        self.origin_hf_arch = hparams.get('architectures', [None])[0]

        # force MoE off in metadata; these keys vary across repos
        for k in (
            "moe_num_experts",
            "num_experts",
            "n_experts",
            "expert_count",
            "n_routed_experts",
            "num_experts_per_tok",
            "moe_top_k",
            "top_k",
            "expert_used_count",
        ):
            if k in self.hparams:
                self.hparams[k] = 0

        # a bit hacky, but currently the only way to detect if this is a rerank model
        # ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
        readme_path = self.dir_model / "README.md"
        readme_text = ""
        if readme_path.exists():
            with readme_path.open("r", encoding="utf-8") as f:
                readme_text = f.read()
        if "# Qwen3-Reranker" in readme_text:
            self._find_rerank_config()

    def set_vocab(self):
        # deal with intern-s1-mini
        if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
            self._set_vocab_interns1()
            return
        super().set_vocab()

    def _find_rerank_config(self):
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)

        self.is_rerank = True
        self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
        self.token_false_id = tokenizer.convert_tokens_to_ids("no")
        self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
        self.sep_token_id = tokenizer.convert_tokens_to_ids("|")

        assert self.token_false_id is not None and self.token_true_id is not None

    def set_gguf_parameters(self):
        # Ensure hparams seen by the base writer advertise no experts
        for k in ("expert_count", "expert_used_count"):
            self.hparams[k] = 0

        super().set_gguf_parameters()

        # Force the final GGUF KV to be dense (no experts), regardless of what the base wrote.
        # Most writers support add_uint32 and will overwrite; fall back to add_key_value if needed.
        if hasattr(self.gguf_writer, "add_uint32"):
            self.gguf_writer.add_uint32("qwen3.expert_count", 0)
            self.gguf_writer.add_uint32("qwen3.expert_used_count", 0)
        elif hasattr(self.gguf_writer, "add_key_value"):
            self.gguf_writer.add_key_value("qwen3.expert_count", 0)
            self.gguf_writer.add_key_value("qwen3.expert_used_count", 0)

        # Rerank extras unchanged
        if self.is_rerank:
            self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK)
            self.gguf_writer.add_classifier_output_labels(["yes", "no"])
            self.gguf_writer.add_chat_template([{
                "name": "rerank",
                "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n"
                            "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n"
                            "<|im_start|>assistant\n<think>\n\n</think>\n\n"
            }])

    def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
        # extract "yes" and "no" tokens from the output lm_head tensor
        false_row = data_torch[self.token_false_id]
        true_row = data_torch[self.token_true_id]
        return torch.stack([true_row, false_row], dim=0)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # 1) Strip vision / multimodal tensors that llama.cpp's text Qwen path can't map.
        #    Returning [] tells the exporter to write nothing for this tensor.
        skip_prefixes = (
            "model.mm_projector.",
            "model.vision_tower.",
            "model.mm_resampler.",
            "model.mm_mlp.",
            "model.vision_encoder.",  # new: catches embeddings.patch_embedding.*
            "vision_encoder.",        # safety for repos that omit "model."
            "vision_",
            "visual_",
            "image_",
            "mm_projector.",
            "projector.",
            "clip_",
        )

        # fast path prefix check
        if name.startswith(skip_prefixes):
            # print(f"Skipping tensor by prefix: {name}")
            return []

        # defensive contains-based skip for any missed VL leftovers
        if any(k in name for k in (".mm_", ".vision_", ".image_", ".visual_", "vision.encoder", "vision_encoder")):
            # print(f"Skipping tensor by contains: {name}")
            return []

        # 2) Existing reranker head logic
        if self.is_rerank:
            is_tied_head = self.is_tied_embeddings and "embed_tokens" in name
            is_real_head = not self.is_tied_embeddings and "lm_head" in name
            if is_tied_head or is_real_head:
                cls_out_head = (
                    gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight",
                    self._get_cls_out_tensor(data_torch),
                )
                if is_tied_head:
                    embed = (self.map_tensor_name(name), data_torch)
                    return [cls_out_head, embed]
                if is_real_head:
                    return [cls_out_head]

        # 3) Everything else through the standard Qwen mapping
        return super().modify_tensors(data_torch, name, bid)
As you can see, I had to drop all the visual tensors and register the model architecture name under the Qwen3 arch class.
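With that patch in place, the conversion goes through the standard llama.cpp entry point; something like the following should work (the model path, output file, and output type are placeholders, not the exact values from my setup):

python convert_hf_to_gguf.py /path/to/Hulu-Med-14B --outfile hulu-med-14b-f16.gguf --outtype f16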
Let me know if you have any comments.
Hello! I've checked the llama.cpp GitHub repository, and it seems it currently supports only text language models such as Qwen3, with multimodal capabilities like Qwen3-VL not yet integrated. Therefore, some modifications to the official conversion code are necessary for now, and we plan to support GGUF in the future.