How to quant:

```bash
git clone https://github.com/vllm-project/llm-compressor.git
cd llm-compressor
pip install -e .
pip install transformers==4.57.0
export TOKENIZERS_PARALLELISM=false
cd ..
```

Quantization script:

Save the following code as quant.py and run it with:

```bash
python3 quant.py
```

```python
import torch
from datasets import load_dataset
from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modeling import replace_modules_for_calibration
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.utils import dispatch_for_generation

# NOTE: Requires a minimum of transformers 4.57.0

MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"

# Load model.
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
    MODEL_ID, 
    torch_dtype=torch.bfloat16,
    device_map=None,
    trust_remote_code=True
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
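# Swap the MoE blocks for calibration-friendly versions so calibration data
# is routed through every expert (as in llm-compressor's MoE examples).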
model = replace_modules_for_calibration(model)

DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 8192

ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess_function(example):
    messages = []
    for message in example["messages"]:
        messages.append(
            {
                "role": message["role"],
                "content": [{"type": "text", "text": message["content"]}],
            }
        )

    return processor.apply_chat_template(
        messages,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
        add_generation_prompt=False,
    )


ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)


def data_collator(batch):
    # oneshot calibration feeds one sample per step; cast pixel_values (if present)
    # to bfloat16 and drop the extra batch dimension added by the processor.
    assert len(batch) == 1
    return {
        key: (
            torch.tensor(value)
            if key != "pixel_values"
            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        )
        for key, value in batch[0].items()
    }


# Configure AWQ quantization with smoothing and balancing
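# The vision tower, embeddings, norms, MoE router gates and lm_head are left unquantized;
# each mapping pairs a layer whose output is smoothed with the linear layers whose
# weights absorb the inverse scales.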
recipe = AWQModifier(
    ignore=[
        're:.*embed_tokens', 
        're:.*input_layernorm$', 
        're:.*mlp[.]gate$', 
        're:.*post_attention_layernorm$', 
        're:.*norm$', 
        're:model[.]visual.*',
        're:visual.*',
        'lm_head'
    ],
    mappings=[
        {
            "smooth_layer": "re:.*input_layernorm$",
            "balance_layers": ['re:.*q_proj$', 're:.*k_proj$', 're:.*v_proj$']
        },
        {
            "smooth_layer": "re:.*v_proj$",
            "balance_layers": ['re:.*o_proj$']
        },
        {
            "smooth_layer": "re:.*post_attention_layernorm$",
            "balance_layers": ['re:.*gate_proj$', 're:.*up_proj$']
        },
        {
            "smooth_layer": "re:.*up_proj$",
            "balance_layers": ['re:.*down_proj$']
        }
    ],
    duo_scaling=True,
    config_groups={
        "group_0": {
            "targets": ["Linear"],
            "weights": {
                "num_bits": 8,
                "type": "int",
                "symmetric": True,
                "group_size": 32,
                "strategy": "group",
                "block_structure": None,
                "dynamic": False,
                "actorder": None,
                "observer": "mse",
                "observer_kwargs": {}
            },
            "input_activations": None,
            "output_activations": None,
            "format": None
        }
    }
)

# Apply AWQ quantization.
oneshot(
    model=model,
    processor=processor,
    recipe=recipe,
    dataset=ds,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    data_collator=data_collator,
)

print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(processor.decode(output[0]))
print("==========================================")

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-AWQ-W8A16-mse-seq"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
```
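
The resulting folder can be loaded like any other compressed-tensors checkpoint. A minimal serving sketch, assuming a recent vLLM build with compressed-tensors and Qwen3-VL support (the directory name below is what SAVE_DIR resolves to in the script):

```bash
# Serve the quantized checkpoint produced by quant.py
vllm serve ./Qwen3-VL-30B-A3B-Instruct-AWQ-W8A16-mse-seq
```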