# TurkGPT / app.py
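# Fine-tunes mistralai/Mixtral-8x7B-Instruct-v0.1 with LoRA adapters on Turkish data
# from OSCAR-2201 and pushes the result to the Hugging Face Hub.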
import torch
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
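
# Base model to fine-tune and the Hub repo id for the fine-tuned result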
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
new_model = "sedataskan/mistral8x7B-finetuned"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=False,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer.pad_token = "!"  # the Mixtral tokenizer ships without a pad token, so one is assigned for padding
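
# LoRA hyperparameters: adapter rank, scaling factor (alpha), and dropout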
LORA_R = 8
LORA_ALPHA = 2 * LORA_R
LORA_DROPOUT = 0.1

config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["w1", "w2", "w3"],  # only train the MoE "expert" feed-forward layers
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
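
# Wrap the base model with the LoRA adapters; only the adapter weights will be trained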
model = get_peft_model(model, config)

def print_trainable_parameters(m):
    """Report how many parameters are trainable after the LoRA adapters are attached."""
    trainable_params = sum(p.numel() for p in m.parameters() if p.requires_grad)
    all_params = sum(p.numel() for p in m.parameters())
    print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params / all_params}")
print_trainable_parameters(model)
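
# Load the Turkish portion of the OSCAR-2201 web corpus as the training data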
train_data = load_dataset("oscar-corpus/OSCAR-2201", "tr", split="train")  # split= so the Trainer receives a Dataset rather than a DatasetDict
print("Dataset", train_data)
def generate_prompt(user_query):
    """Build a prompt in the [INST] format from the official Mixtral Hugging Face page; expects "instruction" and "output" fields."""
    sys_msg = "Take a look at the following instructions and try to follow them."
    p = "<s> [INST]" + sys_msg + "\n" + user_query["instruction"] + "[/INST]" + user_query["output"] + "</s>"
    return p
max_len = 1024

def tokenize(prompt):
    # Append the EOS token and pad/truncate every example to a fixed length
    return tokenizer(
        prompt + tokenizer.eos_token,
        truncation=True,
        max_length=max_len,
        padding="max_length",
    )
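
# Shuffle the corpus, turn each example into a prompt, and tokenize it in one pass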
train_data = train_data.shuffle().map(lambda x: tokenize(generate_prompt(x)), remove_columns=["instruction", "output"])
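
# Trainer setup: per-device batch of 1 with 4-step gradient accumulation and a causal-LM collator (mlm=False)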
trainer = Trainer(
    model=model,
    train_dataset=train_data,
    args=TrainingArguments(
        output_dir="outputs",  # local checkpoint directory (name is an arbitrary choice); required by most transformers versions
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=6,
        learning_rate=1e-4,
        logging_steps=2,
        optim="adamw_torch",
        save_strategy="epoch",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # disable the generation KV cache while training
# Train model
trainer.train()
# Save trained model
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)
# Push them to the HF Hub (a write-access Hugging Face token must be supplied via token=)
trainer.model.push_to_hub(new_model, use_temp_dir=False, token="")
tokenizer.push_to_hub(new_model, use_temp_dir=False, token="")
# # Format prompt
# message = [
#     "Türkiye'nin başkenti neresidir?"  # "What is the capital of Turkey?"
# ]
# tokenizer = AutoTokenizer.from_pretrained(new_model)
# prompt = tokenizer(message, return_tensors="pt", padding=True).to(model.device)
#
# # Generate output
# output = trainer.model.generate(
#     input_ids=prompt.input_ids,
#     attention_mask=prompt.attention_mask,
#     max_length=128,
#     do_sample=True,
#     top_p=0.95,
#     top_k=60,
#     num_return_sequences=1,
# )
#
# # Print output
# print(tokenizer.batch_decode(output, skip_special_tokens=True))
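
# A minimal sketch of reloading the fine-tuned model in a later session, assuming the pushed
# repo contains LoRA adapter weights (as saved by the PEFT-wrapped trainer.model above):
#
# from peft import PeftModel
# base = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
# finetuned = PeftModel.from_pretrained(base, new_model)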