# Quantization-Attempts / load_gguf.py
from transformers import AutoModel

base_model = "ibm-research/granite-3.2-8b-instruct-GGUF"
GGUF_MODEL = "granite-3.2-8b-instruct-Q4_K_M.gguf"

# Non-GGUF checkpoint, kept here for comparison:
# model = AutoModel.from_pretrained("ibm-research/granite-3.2-8b-instruct", device_map="auto")

# Load the Q4_K_M GGUF file; transformers dequantizes the GGUF weights on load.
model = AutoModel.from_pretrained(base_model, device_map="auto", torch_dtype="auto", quantization_config=None, gguf_file=GGUF_MODEL)
print(model.config)
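
# Minimal generation sketch (not part of the original script, shown only as an
# illustration): AutoModel loads the backbone without an LM head, so to actually
# generate text the checkpoint is reloaded with AutoModelForCausalLM. This assumes
# the GGUF repo also provides tokenizer files readable via gguf_file, as in the
# TinyLlama reference example below.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(base_model, gguf_file=GGUF_MODEL)
lm = AutoModelForCausalLM.from_pretrained(base_model, gguf_file=GGUF_MODEL, device_map="auto", torch_dtype="auto")

prompt = "Explain GGUF quantization in one sentence."
inputs = tokenizer(prompt, return_tensors="pt").to(lm.device)
output_ids = lm.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))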
"""
# pip install gguf
from transformers import AutoTokenizer, AutoModelForCausalLM
model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
filename = "tinyllama-1.1b-chat-v1.0.Q6_K.gguf"
torch_dtype = torch.float32 # could be torch.float16 or torch.bfloat16 too
tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename)
model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename, torch_dtype=torch_dtype)
"""