import torch
import transformers
import bitsandbytes
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel

model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
new_model = "sedataskan/mistral8x7B-finetuned"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the base model in half precision across available devices
# (load_in_4bit is left off here; an optional quantized variant is sketched below)
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             load_in_4bit=False,
                                             torch_dtype=torch.float16,
                                             device_map="auto")
tokenizer.pad_token = "!"  # Mixtral's tokenizer has no pad token by default, so set one explicitly
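
# Optional: the bitsandbytes / prepare_model_for_kbit_training imports above suggest a
# quantized workflow. A minimal 4-bit sketch (not what this script runs as written,
# since load_in_4bit=False above) would replace the load above with:
#
#   bnb_config = transformers.BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_quant_type="nf4",
#       bnb_4bit_compute_dtype=torch.float16,
#   )
#   model = AutoModelForCausalLM.from_pretrained(model_name,
#                                                quantization_config=bnb_config,
#                                                device_map="auto")
#   model = prepare_model_for_kbit_training(model)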

LORA_R = 8
LORA_ALPHA = 2 * LORA_R
LORA_DROPOUT = 0.1

config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["w1", "w2", "w3"],  # only train the expert feed-forward projections in each MoE block
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)

def print_trainable_parameters(m):
    trainable_params = sum(p.numel() for p in m.parameters() if p.requires_grad)
    all_params = sum(p.numel() for p in m.parameters())
    print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params / all_params}")

print_trainable_parameters(model)

# NOTE: the prompt/tokenize pipeline below expects "instruction" and "output" columns;
# OSCAR-2201 ("tr" config) provides raw web text, so either swap in an instruction-style
# dataset or adapt generate_prompt to the columns you actually have.
train_data = load_dataset("oscar-corpus/OSCAR-2201", "tr", split="train")
print("Dataset", train_data)

# The prompt format follows the official Mixtral Hugging Face model card
def generate_prompt(user_query):
    sys_msg = "Take a look at the following instructions and try to follow them."
    p = "<s> [INST] " + sys_msg + "\n" + user_query["instruction"] + " [/INST] " + user_query["output"] + "</s>"
    return p

max_len = 1024

def tokenize(prompt):
    # Pad/truncate every example to a fixed length so batches stack cleanly
    return tokenizer(
        prompt + tokenizer.eos_token,
        truncation=True,
        max_length=max_len,
        padding="max_length"
    )
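
# Quick sanity check of the prompt/tokenize pipeline on a hypothetical record
# (the keys mirror the "instruction"/"output" columns the mapping below expects):
#
#   sample = {"instruction": "Türkiye'nin başkenti neresidir?", "output": "Ankara."}
#   print(generate_prompt(sample))
#   print(len(tokenize(generate_prompt(sample))["input_ids"]))  # padded out to max_len (1024)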

train_data = train_data.shuffle().map(
    lambda x: tokenize(generate_prompt(x)),
    remove_columns=["instruction", "output"]
)

trainer = Trainer(
    model=model,
    train_dataset=train_data,
    args=TrainingArguments(
        output_dir="./results",  # local checkpoint directory (TrainingArguments requires one)
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=6,
        learning_rate=1e-4,
        logging_steps=2,
        optim="adamw_torch",
        save_strategy="epoch"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # the KV cache is only useful at inference time; disable it during training

# Train model
trainer.train()
# Save trained model
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

# Push them to the HF Hub
trainer.model.push_to_hub(new_model, use_temp_dir=False, token="")  # supply a Hugging Face write token
tokenizer.push_to_hub(new_model, use_temp_dir=False, token="")
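
# The PeftModel import above is only needed when the saved adapter is reloaded later.
# A minimal reloading sketch (assumes the adapter was saved/pushed under `new_model`):
#
#   base = AutoModelForCausalLM.from_pretrained(model_name,
#                                               torch_dtype=torch.float16,
#                                               device_map="auto")
#   finetuned = PeftModel.from_pretrained(base, new_model)
#   finetuned = finetuned.merge_and_unload()  # optionally fold the LoRA weights into the base model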

# # Format prompt
# message = [
#     "Türkiye'nin başkenti neresidir?"
# ]
# tokenizer = AutoTokenizer.from_pretrained(new_model)
# prompt = tokenizer(message, return_tensors="pt", padding=True)
# # Generate output
# output = trainer.model.generate(
#     input_ids=prompt.input_ids,
#     attention_mask=prompt.attention_mask,
#     max_length=128,
#     do_sample=True,
#     top_p=0.95,
#     top_k=60,
#     num_return_sequences=1,
# )
# # Print output
# print(tokenizer.batch_decode(output, skip_special_tokens=True))