from transformers import AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, DataCollatorForSeq2Seq
from datasets import load_dataset
from model_load import model
from tokenizer import tokenizer, tokenize
from data_load import preprocess
from utils.utils import inspect_tokenized_dataset, print_label_lengths, print_field_lengths
from loss.trainer import MyTrainer
import torch
# ✅ Check the padding token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print(f"[debug] pad_token added -> {tokenizer.pad_token}")
# ✅ Resize the model embeddings to match the tokenizer
model.resize_token_embeddings(len(tokenizer))
print(f"[debug] model embeddings resized -> {len(tokenizer)}")
# ✅ Load the data
dataset_path = "yahma/alpaca-cleaned"  # or "./my_dataset.json"
if dataset_path.endswith(".json") or dataset_path.endswith(".jsonl"):
    raw_data = load_dataset("json", data_files=dataset_path, split="train")
else:
    raw_data = load_dataset(dataset_path, split="train")
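# yahma/alpaca-cleaned is an instruction-tuning dataset with instruction / input /
# output fields; preprocess() below is assumed to fold these into prompt/response
# text per example (an assumption about the local data_load module).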
# ✅ Preprocess the data
processed_dataset = preprocess(raw_data)
print_field_lengths(processed_dataset, stage="after preprocessing")
# ✅ Tokenize
tokenized_dataset = processed_dataset.map(tokenize, batched=True, remove_columns=processed_dataset.column_names)
print_field_lengths(tokenized_dataset, stage="after tokenization")
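# tokenize() is expected to return input_ids, attention_mask, and labels for each
# example (those are the columns selected by set_format below).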
# ✅ Convert to torch tensors
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
print("[debug] tensor format set")
print_field_lengths(tokenized_dataset, stage="after tensor format")
# ✅ Check the type of each field
assert isinstance(tokenized_dataset[0]["input_ids"], torch.Tensor), "input_ids is not a Tensor"
assert isinstance(tokenized_dataset[0]["labels"], torch.Tensor), "labels is not a Tensor"
# ✅ Run inspect_tokenized_dataset
# inspect_tokenized_dataset(tokenized_dataset)
# ✅ Configure TrainingArguments
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_dir="./logs",
    report_to="none",
    deepspeed="ds_config.json",
    save_total_limit=1,
    save_strategy="epoch",
    fp16=True,
)
print("[λλ²κΉ
] TrainingArguments μ€μ μλ£")
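# NOTE: ds_config.json is referenced above but not shown in this file. A minimal,
# illustrative DeepSpeed config consistent with fp16=True (an assumption, not the
# project's actual file) could look like:
# {
#   "train_micro_batch_size_per_gpu": "auto",
#   "gradient_accumulation_steps": "auto",
#   "fp16": {"enabled": "auto"},
#   "zero_optimization": {"stage": 2}
# }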
# ✅ Set up the Trainer
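# The commented-out data_collator argument below is never defined elsewhere in this
# script. If dynamic padding is wanted, one sketch, assuming the already-imported
# DataCollatorForSeq2Seq is the intended collator, is:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,             # pad each batch to its longest sequence
    label_pad_token_id=-100,  # padding positions in labels are ignored by the loss
)
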
trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    # data_collator=data_collator,  # enable if needed
)
print("[λλ²κΉ
] Trainer μΈμ€ν΄μ€ μμ± μλ£")
# ✅ Start training
print("[debug] starting training")
trainer.train()
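# Persisting the fine-tuned weights is left out of the original script; if desired
# (an optional addition, not part of the source), something like:
# trainer.save_model("./output")
# tokenizer.save_pretrained("./output")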