GGUF

#2 opened by islameissa

I'm trying to convert the model to GGUF, but the transformers architecture does not match.

Hello, we have now updated the Hugging Face compatible model in the main branch. You can load it directly using the following code:

from transformers import AutoModelForCausalLM, AutoProcessor
import torch

model_path = "ZJU-AI4H/Hulu-Med-14B"

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",
)

processor = AutoProcessor.from_pretrained(
    model_path,
    trust_remote_code=True
)
tokenizer = processor.tokenizer
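
For a quick text-only check, here is a minimal generation sketch. It assumes the remote-code tokenizer ships a chat template; image inputs go through the processor and their exact format may differ, so treat this as a rough guide and refer to the model card for multimodal usage.

# text-only sketch; the question below is just an illustrative example
messages = [{"role": "user", "content": "What are the typical symptoms of iron-deficiency anemia?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=256)

# decode only the newly generated tokens
print(tokenizer.decode(output_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True))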

Thank you very much.

I had to modify this section in convert_hf_to_gguf.py:

@ModelBase.register("Qwen3ForCausalLM", "HulumedQwen3ForCausalLM")
class Qwen3Model(Qwen2Model):
    model_arch = gguf.MODEL_ARCH.QWEN3

    # extra logic for rerank models
    is_rerank: bool = False
    is_tied_embeddings: bool = False
    token_false_id: int | None = None
    token_true_id: int | None = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # track for intern-s1-mini
        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
        self.origin_hf_arch = hparams.get('architectures', [None])[0]

        # force MoE off in metadata – these keys vary across repos
        for k in (
            "moe_num_experts",
            "num_experts",
            "n_experts",
            "expert_count",
            "n_routed_experts",
            "num_experts_per_tok",
            "moe_top_k",
            "top_k",
            "expert_used_count",
        ):
            if k in self.hparams:
                self.hparams[k] = 0

        # a bit hacky, but currently the only way to detect if this is a rerank model
        # ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
        readme_path = self.dir_model / "README.md"
        readme_text = ""
        if readme_path.exists():
            with readme_path.open("r", encoding="utf-8") as f:
                readme_text = f.read()
        if "# Qwen3-Reranker" in readme_text:
            self._find_rerank_config()

    def set_vocab(self):
        # deal with intern-s1-mini
        if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
            self._set_vocab_interns1()
            return

        super().set_vocab()

    def _find_rerank_config(self):
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)

        self.is_rerank = True
        self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
        self.token_false_id = tokenizer.convert_tokens_to_ids("no")
        self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
        self.sep_token_id = tokenizer.convert_tokens_to_ids("|")

        assert self.token_false_id is not None and self.token_true_id is not None

    def set_gguf_parameters(self):
        # Ensure hparams seen by the base writer advertise no experts
        for k in ("expert_count", "expert_used_count"):
            self.hparams[k] = 0

        super().set_gguf_parameters()

        # Force the final GGUF KV to be dense (no experts), regardless of what the base wrote.
        # Most writers support add_uint32 and will overwrite; fall back to add_key_value if needed.
        if hasattr(self.gguf_writer, "add_uint32"):
            self.gguf_writer.add_uint32("qwen3.expert_count", 0)
            self.gguf_writer.add_uint32("qwen3.expert_used_count", 0)
        elif hasattr(self.gguf_writer, "add_key_value"):
            self.gguf_writer.add_key_value("qwen3.expert_count", 0)
            self.gguf_writer.add_key_value("qwen3.expert_used_count", 0)

        # Rerank extras unchanged
        if self.is_rerank:
            self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK)
            self.gguf_writer.add_classifier_output_labels(["yes", "no"])
            self.gguf_writer.add_chat_template([{
                "name": "rerank",
                "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n"
                            "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n"
                            "<|im_start|>assistant\n<think>\n\n</think>\n\n"
            }])

    def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
        # extract "yes" and "no" tokens from the output lm_head tensor
        false_row = data_torch[self.token_false_id]
        true_row = data_torch[self.token_true_id]
        return torch.stack([true_row, false_row], dim=0)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # 1) Strip vision / multimodal tensors that llama.cpp's text Qwen path can't map.
        #    Returning [] tells the exporter to write nothing for this tensor.
        skip_prefixes = (
            "model.mm_projector.",
            "model.vision_tower.",
            "model.mm_resampler.",
            "model.mm_mlp.",
            "model.vision_encoder.",          # new: catches embeddings.patch_embedding.*
            "vision_encoder.",                # safety for repos that omit "model."
            "vision_",
            "visual_",
            "image_",
            "mm_projector.",
            "projector.",
            "clip_",
        )
        # fast path prefix check
        if name.startswith(skip_prefixes):
            # print(f"Skipping tensor by prefix: {name}")
            return []

        # defensive contains-based skip for any missed VL leftovers
        if any(k in name for k in (".mm_", ".vision_", ".image_", ".visual_", "vision.encoder", "vision_encoder")):
            # print(f"Skipping tensor by contains: {name}")
            return []

        # 2) Existing reranker head logic
        if self.is_rerank:
            is_tied_head = self.is_tied_embeddings and "embed_tokens" in name
            is_real_head = not self.is_tied_embeddings and "lm_head" in name
            if is_tied_head or is_real_head:
                cls_out_head = (
                    gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight",
                    self._get_cls_out_tensor(data_torch),
                )
                if is_tied_head:
                    embed = (self.map_tensor_name(name), data_torch)
                    return [cls_out_head, embed]
                if is_real_head:
                    return [cls_out_head]

        # 3) Everything else through the standard Qwen mapping
        return super().modify_tensors(data_torch, name, bid)

You can see that I had to drop all the vision tensors and register the HulumedQwen3ForCausalLM architecture name on the Qwen3 arch class.
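
For anyone who wants to sanity-check the filter before running a full conversion, here is a small standalone sketch of the same skip logic applied to a few hypothetical tensor names (the names below are illustrative, not taken from the actual checkpoint):

# standalone copy of the skip filter from modify_tensors above
skip_prefixes = (
    "model.mm_projector.", "model.vision_tower.", "model.mm_resampler.", "model.mm_mlp.",
    "model.vision_encoder.", "vision_encoder.", "vision_", "visual_", "image_",
    "mm_projector.", "projector.", "clip_",
)
contains_keys = (".mm_", ".vision_", ".image_", ".visual_", "vision.encoder", "vision_encoder")

def is_skipped(name: str) -> bool:
    return name.startswith(skip_prefixes) or any(k in name for k in contains_keys)

for name in (
    "model.vision_encoder.embeddings.patch_embedding.weight",  # vision tensor -> skipped
    "model.mm_projector.linear_1.weight",                      # projector tensor -> skipped
    "model.layers.0.self_attn.q_proj.weight",                  # text tensor -> kept
    "lm_head.weight",                                          # text tensor -> kept
):
    print(f"{name}: {'skip' if is_skipped(name) else 'keep'}")

Everything that survives the filter is routed through the standard Qwen3 tensor mapping, as in the last line of modify_tensors.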

Let me know if you have any comments.

Hello! I've checked the llama.cpp GitHub repository, and it seems that it currently only supports text-only language models such as Qwen3; multimodal capabilities such as Qwen3-VL are not yet integrated. Therefore, some modifications to the official code might be necessary, and we plan to transition to GGUF in the future.
