from transformers import AutoTokenizer, AutoModelForCausalLM

from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "perplexity-ai/r1-1776"

# Load the model and tokenizer at their original precision.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Configure simple post-training quantization (PTQ): dynamic FP8 for all
# Linear layers, skipping the output head and the MoE router gates.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=["lm_head", "re:.*mlp.gate$"],
)

# Apply the quantization algorithm. FP8_DYNAMIC is data-free, so no
# calibration dataset is needed.
oneshot(model=model, recipe=recipe, trust_remote_code_model=True)

# Save the quantized model and tokenizer.
SAVE_DIR = "output/" + MODEL_ID.split("/")[1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
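
# Optional: sanity-check the saved checkpoint. A minimal sketch, not part of
# the original recipe, assuming vLLM is installed and the hardware supports
# FP8 (e.g. Hopper GPUs). The full r1-1776 model is far too large for one
# GPU, so tensor_parallel_size below is an assumption; adjust for your setup.
from vllm import LLM, SamplingParams

llm = LLM(model=SAVE_DIR, tensor_parallel_size=8)  # assumption: 8 GPUs
outputs = llm.generate(
    ["What is FP8 quantization?"],
    SamplingParams(temperature=0.6, max_tokens=64),
)
print(outputs[0].outputs[0].text)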