from transformers import TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset
from model_load import model
from tokenizer import tokenizer, tokenize
from data_load import preprocess
from utils.utils import inspect_tokenized_dataset, print_label_lengths, print_field_lengths
from loss.trainer import MyTrainer
import torch
# ✅ Make sure the tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print(f"[DEBUG] pad_token added → {tokenizer.pad_token}")
# ✅ Resize the model's token embeddings to match the tokenizer
model.resize_token_embeddings(len(tokenizer))
print(f"[DEBUG] model embedding size resized → {len(tokenizer)}")
# ✅ Load the dataset
dataset_path = "yahma/alpaca-cleaned"  # or "./my_dataset.json"
if dataset_path.endswith(".json") or dataset_path.endswith(".jsonl"):
    raw_data = load_dataset("json", data_files=dataset_path, split="train")
else:
    raw_data = load_dataset(dataset_path, split="train")
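# Quick size check in the same [DEBUG] style (a small added sketch, not in the
# original script): report how many raw examples were loaded before preprocessing
print(f"[DEBUG] raw dataset loaded → {len(raw_data)} examples")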
# ✅ Preprocess the data
processed_dataset = preprocess(raw_data)
print_field_lengths(processed_dataset, stage="after preprocessing")
# ✅ Tokenize
tokenized_dataset = processed_dataset.map(tokenize, batched=True, remove_columns=processed_dataset.column_names)
print_field_lengths(tokenized_dataset, stage="after tokenization")
# ✅ Convert to PyTorch tensor format
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
print("[DEBUG] tensor format set")
print_field_lengths(tokenized_dataset, stage="after tensor formatting")
# ✅ Check the type of each field
assert isinstance(tokenized_dataset[0]["input_ids"], torch.Tensor), "input_ids is not a Tensor"
assert isinstance(tokenized_dataset[0]["labels"], torch.Tensor), "labels is not a Tensor"
# ✅ Run inspect_tokenized_dataset (optional)
# inspect_tokenized_dataset(tokenized_dataset)
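# Another quick sanity check (a sketch, not part of the original script): decode
# the first example to eyeball the prompt that preprocess/tokenize produced.
# print(tokenizer.decode(tokenized_dataset[0]["input_ids"], skip_special_tokens=False))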
# ✅ Configure TrainingArguments
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_dir="./logs",
    report_to="none",
    deepspeed="ds_config.json",
    save_total_limit=1,
    save_strategy="epoch",
    fp16=True,
)
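# A minimal example of what ds_config.json might contain (an assumption; the
# actual file is not shown here). The "auto" values let the HF Trainer fill in
# the matching TrainingArguments, e.g. batch size and fp16:
# {
#   "train_micro_batch_size_per_gpu": "auto",
#   "gradient_accumulation_steps": "auto",
#   "fp16": {"enabled": "auto"},
#   "zero_optimization": {"stage": 2}
# }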
print("[디버깅] TrainingArguments μ„€μ • μ™„λ£Œ")
# ✅ Set up the Trainer
trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    # data_collator=data_collator,  # enable if needed
)
print("[디버깅] Trainer μΈμŠ€ν„΄μŠ€ 생성 μ™„λ£Œ")
# βœ… ν•™μŠ΅ μ‹œμž‘
print("[디버깅] ν•™μŠ΅ μ‹œμž‘")
trainer.train()
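# Optional: persist the final weights and tokenizer after training. A minimal
# sketch with an assumed output path; per-epoch checkpoints are already written
# by save_strategy="epoch" above.
trainer.save_model("./output/final")
tokenizer.save_pretrained("./output/final")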