from transformers import AutoTokenizer, AutoModelForCausalLM

from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "perplexity-ai/r1-1776"

# Load the model and tokenizer at their original precision.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Configure simple post-training quantization (PTQ): dynamic FP8 for all
# Linear layers, skipping the output head and the MoE router gates.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=["lm_head", "re:.*mlp.gate$"],
)

# Apply the quantization algorithm. FP8_DYNAMIC is data-free, so no
# calibration dataset is needed.
oneshot(model=model, recipe=recipe, trust_remote_code_model=True)

# Save the quantized model and tokenizer.
SAVE_DIR = "output/" + MODEL_ID.split("/")[1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
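
# Optional: sanity-check the saved checkpoint. A minimal sketch, not part of
# the original recipe, assuming vLLM is installed and the hardware supports
# FP8 (e.g. Hopper GPUs). The full r1-1776 model is far too large for one
# GPU, so tensor_parallel_size below is an assumption; adjust for your setup.
from vllm import LLM, SamplingParams

llm = LLM(model=SAVE_DIR, tensor_parallel_size=8)  # assumption: 8 GPUs
outputs = llm.generate(
    ["What is FP8 quantization?"],
    SamplingParams(temperature=0.6, max_tokens=64),
)
print(outputs[0].outputs[0].text)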