How to quantize:
git clone https://github.com/vllm-project/llm-compressor.git
cd llm-compressor
pip install -e .
pip install transformers==4.57.0
export TOKENIZERS_PARALLELISM=false
cd ..
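Optional environment check (a minimal sketch; the llmcompressor distribution name and the use of importlib.metadata are assumptions, not part of the upstream instructions). Run it in a Python shell to confirm the pinned transformers release and the editable llm-compressor install are both visible:
import importlib.metadata as metadata
import transformers

# Expect 4.57.0 (pinned above) and the version reported by the editable llm-compressor checkout.
print("transformers:", transformers.__version__)
print("llmcompressor:", metadata.version("llmcompressor"))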
Script for quantization:
Save the following code as quant.py and run it with:
python3 quant.py
import torch
from datasets import load_dataset
from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration
from llmcompressor import oneshot
from llmcompressor.modeling import replace_modules_for_calibration
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.utils import dispatch_for_generation
# NOTE: Requires a minimum of transformers 4.57.0
MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"
# Load model.
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map=None,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = replace_modules_for_calibration(model)
DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 8192
ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)
def preprocess_function(example):
    messages = []
    for message in example["messages"]:
        messages.append(
            {
                "role": message["role"],
                "content": [{"type": "text", "text": message["content"]}],
            }
        )
    return processor.apply_chat_template(
        messages,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
        add_generation_prompt=False,
    )
ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)
def data_collator(batch):
    assert len(batch) == 1
    return {
        key: (
            torch.tensor(value)
            if key != "pixel_values"
            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        )
        for key, value in batch[0].items()
    }
# Configure AWQ quantization with smoothing and balancing
recipe = AWQModifier(
    ignore=[
        "re:.*embed_tokens",
        "re:.*input_layernorm$",
        "re:.*mlp[.]gate$",
        "re:.*post_attention_layernorm$",
        "re:.*norm$",
        "re:model[.]visual.*",
        "re:visual.*",
        "lm_head",
    ],
    mappings=[
        {
            "smooth_layer": "re:.*input_layernorm$",
            "balance_layers": ["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"],
        },
        {
            "smooth_layer": "re:.*v_proj$",
            "balance_layers": ["re:.*o_proj$"],
        },
        {
            "smooth_layer": "re:.*post_attention_layernorm$",
            "balance_layers": ["re:.*gate_proj$", "re:.*up_proj$"],
        },
        {
            "smooth_layer": "re:.*up_proj$",
            "balance_layers": ["re:.*down_proj$"],
        },
    ],
    duo_scaling=True,
    config_groups={
        "group_0": {
            "targets": ["Linear"],
            "weights": {
                "num_bits": 8,
                "type": "int",
                "symmetric": True,
                "group_size": 32,
                "strategy": "group",
                "block_structure": None,
                "dynamic": False,
                "actorder": None,
                "observer": "mse",
                "observer_kwargs": {},
            },
            "input_activations": None,
            "output_activations": None,
            "format": None,
        }
    },
)
# Apply AWQ quantization.
oneshot(
    model=model,
    processor=processor,
    recipe=recipe,
    dataset=ds,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    data_collator=data_collator,
)
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(processor.decode(output[0]))
print("==========================================")
# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-AWQ-W8A16-mse-seq"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
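After the script finishes, a quick way to confirm the checkpoint was written in compressed-tensors form is to reload its config and look at the embedded quantization settings (a minimal sketch; the directory name below mirrors the SAVE_DIR built above, and quantization_config is the field compressed-tensors checkpoints normally carry):
from transformers import AutoConfig

# Reload only the config of the saved checkpoint and print its quantization block.
cfg = AutoConfig.from_pretrained("Qwen3-VL-30B-A3B-Instruct-AWQ-W8A16-mse-seq", trust_remote_code=True)
print(getattr(cfg, "quantization_config", None))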