# TurkGPT / app.py
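# Fine-tunes mistralai/Mixtral-8x7B-Instruct-v0.1 with LoRA adapters on Turkish data
# from OSCAR-2201 and pushes the result to the Hugging Face Hub.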
import torch
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
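
# Base model to fine-tune and the Hub repo id for the fine-tuned result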
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
new_model = "sedataskan/mistral8x7B-finetuned"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=False,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer.pad_token = "!"  # the Mixtral tokenizer ships without a pad token, so one is assigned for padding
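
# LoRA hyperparameters: adapter rank, scaling factor (alpha), and dropout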
LORA_R = 8
LORA_ALPHA = 2 * LORA_R
LORA_DROPOUT = 0.1

config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["w1", "w2", "w3"],  # only train the MoE "expert" feed-forward layers
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
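
# Wrap the base model with the LoRA adapters; only the adapter weights will be trained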
model = get_peft_model(model, config)

def print_trainable_parameters(m):
    """Report how many parameters are trainable after the LoRA adapters are attached."""
    trainable_params = sum(p.numel() for p in m.parameters() if p.requires_grad)
    all_params = sum(p.numel() for p in m.parameters())
    print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {100 * trainable_params / all_params}")
print_trainable_parameters(model)
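
# Load the Turkish portion of the OSCAR-2201 web corpus as the training data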
train_data = load_dataset("oscar-corpus/OSCAR-2201", "tr", split="train")  # split= so the Trainer receives a Dataset rather than a DatasetDict
print("Dataset", train_data)
def generate_prompt(user_query):
    """Build a prompt in the [INST] format from the official Mixtral Hugging Face page; expects "instruction" and "output" fields."""
    sys_msg = "Take a look at the following instructions and try to follow them."
    p = "<s> [INST]" + sys_msg + "\n" + user_query["instruction"] + "[/INST]" + user_query["output"] + "</s>"
    return p
max_len = 1024

def tokenize(prompt):
    # Append the EOS token and pad/truncate every example to a fixed length
    return tokenizer(
        prompt + tokenizer.eos_token,
        truncation=True,
        max_length=max_len,
        padding="max_length",
    )
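
# Shuffle the corpus, turn each example into a prompt, and tokenize it in one pass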
train_data = train_data.shuffle().map(lambda x: tokenize(generate_prompt(x)), remove_columns=["instruction", "output"])
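
# Trainer setup: per-device batch of 1 with 4-step gradient accumulation and a causal-LM collator (mlm=False)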
trainer = Trainer(
    model=model,
    train_dataset=train_data,
    args=TrainingArguments(
        output_dir="outputs",  # local checkpoint directory (name is an arbitrary choice); required by most transformers versions
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=6,
        learning_rate=1e-4,
        logging_steps=2,
        optim="adamw_torch",
        save_strategy="epoch",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # disable the generation KV cache while training
# Train model
trainer.train()
# Save trained model
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)
# Push them to the HF Hub (a write-access Hugging Face token must be supplied via token=)
trainer.model.push_to_hub(new_model, use_temp_dir=False, token="")
tokenizer.push_to_hub(new_model, use_temp_dir=False, token="")
# # Format prompt
# message = [
#     "Türkiye'nin başkenti neresidir?"  # "What is the capital of Turkey?"
# ]
# tokenizer = AutoTokenizer.from_pretrained(new_model)
# prompt = tokenizer(message, return_tensors="pt", padding=True).to(model.device)
#
# # Generate output
# output = trainer.model.generate(
#     input_ids=prompt.input_ids,
#     attention_mask=prompt.attention_mask,
#     max_length=128,
#     do_sample=True,
#     top_p=0.95,
#     top_k=60,
#     num_return_sequences=1,
# )
#
# # Print output
# print(tokenizer.batch_decode(output, skip_special_tokens=True))
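
# A minimal sketch of reloading the fine-tuned model in a later session, assuming the pushed
# repo contains LoRA adapter weights (as saved by the PEFT-wrapped trainer.model above):
#
# from peft import PeftModel
# base = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
# finetuned = PeftModel.from_pretrained(base, new_model)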