from transformers import TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset
from model_load import model
from tokenizer import tokenizer, tokenize
from data_load import preprocess
from utils.utils import inspect_tokenized_dataset, print_label_lengths, print_field_lengths
from loss.trainer import MyTrainer
import torch
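# NOTE: model_load, tokenizer, data_load, utils, and loss.trainer are assumed to be
# project-local modules exposing the pre-loaded model, the tokenizer plus a tokenize()
# mapping function, the preprocess() step, debugging helpers, and a Trainer subclass.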
# Check padding token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print(f"[Debug] pad_token added → {tokenizer.pad_token}")
# Resize model embeddings to match the tokenizer
model.resize_token_embeddings(len(tokenizer))
print(f"[Debug] model embedding resize complete → {len(tokenizer)}")
# Load data
dataset_path = "yahma/alpaca-cleaned"  # or "./my_dataset.json"
if dataset_path.endswith(".json") or dataset_path.endswith(".jsonl"):
    raw_data = load_dataset("json", data_files=dataset_path, split="train")
else:
    raw_data = load_dataset(dataset_path, split="train")
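# yahma/alpaca-cleaned ships "instruction", "input", and "output" columns;
# preprocess() below is assumed to fold these into the fields tokenize() expects.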
# Preprocess data
processed_dataset = preprocess(raw_data)
print_field_lengths(processed_dataset, stage="after preprocessing")
# Tokenize
tokenized_dataset = processed_dataset.map(tokenize, batched=True, remove_columns=processed_dataset.column_names)
print_field_lengths(tokenized_dataset, stage="after tokenization")
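# tokenize() is assumed to emit input_ids, attention_mask, and labels per example;
# set_format below will raise if any of those columns is missing.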
# Convert to torch tensors
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
print("[Debug] tensor format set")
print_field_lengths(tokenized_dataset, stage="after tensor format")
# Verify field types
assert isinstance(tokenized_dataset[0]["input_ids"], torch.Tensor), "input_ids is not a Tensor"
assert isinstance(tokenized_dataset[0]["labels"], torch.Tensor), "labels is not a Tensor"
# Run inspect_tokenized_dataset (enable if needed)
# inspect_tokenized_dataset(tokenized_dataset)
# Configure TrainingArguments
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_dir="./logs",
    report_to="none",
    deepspeed="ds_config.json",
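    # NOTE: ds_config.json must exist at this path, and its fp16 setting should
    # agree with fp16=True below, or the HF DeepSpeed integration may raise a
    # config-mismatch error.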
    save_total_limit=1,
    save_strategy="epoch",
    fp16=True,
)
print("[Debug] TrainingArguments configured")
# Set up Trainer
trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    # data_collator=data_collator,  # enable if needed
)
print("[Debug] Trainer instance created")
# Start training
print("[Debug] starting training")
trainer.train()