from transformers import AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, DataCollatorForSeq2Seq
from datasets import load_dataset

from model_load import model
from tokenizer import tokenizer, tokenize
from data_load import preprocess
from utils.utils import inspect_tokenized_dataset, print_label_lengths, print_field_lengths
from loss.trainer import MyTrainer

import torch
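
# Assumptions about the local modules, inferred from how they are used below (not documented in this file):
# - model_load.model is a Hugging Face model whose token embeddings can be resized and trained
# - tokenizer.tokenize is a batched mapping function that produces input_ids / attention_mask / labels
# - loss.trainer.MyTrainer is presumably a Trainer subclass (e.g. with a custom loss)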

# βœ… Check the padding token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print(f"[DEBUG] pad_token added β†’ {tokenizer.pad_token}")

# βœ… λͺ¨λΈ ν† ν¬λ‚˜μ΄μ € 크기 μ‘°μ •
model.resize_token_embeddings(len(tokenizer))
print(f"[디버깅] λͺ¨λΈ μž„λ² λ”© 크기 μž¬μ‘°μ • μ™„λ£Œ β†’ {len(tokenizer)}")

# βœ… Load the dataset
dataset_path = "yahma/alpaca-cleaned"  # or "./my_dataset.json"
if dataset_path.endswith(".json") or dataset_path.endswith(".jsonl"):
    raw_data = load_dataset("json", data_files=dataset_path, split="train")
else:
    raw_data = load_dataset(dataset_path, split="train")

# βœ… Preprocess the data
processed_dataset = preprocess(raw_data)
print_field_lengths(processed_dataset, stage="after preprocessing")

# βœ… ν† ν¬λ‚˜μ΄μ§•
tokenized_dataset = processed_dataset.map(tokenize, batched=True, remove_columns=processed_dataset.column_names)
print_field_lengths(tokenized_dataset, stage="ν† ν¬λ‚˜μ΄μ§• ν›„")

# βœ… Convert to torch tensor format
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
print("[DEBUG] Tensor format set")
print_field_lengths(tokenized_dataset, stage="after tensor format")

# βœ… Check each field's type
assert isinstance(tokenized_dataset[0]["input_ids"], torch.Tensor), "input_ids is not a Tensor"
assert isinstance(tokenized_dataset[0]["labels"], torch.Tensor), "labels is not a Tensor"

# βœ… Run inspect_tokenized_dataset
# inspect_tokenized_dataset(tokenized_dataset)

# βœ… TrainingArguments setup
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_dir="./logs",
    report_to="none",
    deepspeed="ds_config.json",
    save_total_limit=1,
    save_strategy="epoch",
    fp16=True,
)
print("[디버깅] TrainingArguments μ„€μ • μ™„λ£Œ")

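# βœ… Optional data collator (illustrative sketch, not part of the original script): if tokenize()
# does not already pad inputs and labels to a fixed length, a DataCollatorForSeq2Seq can pad them
# dynamically per batch; wire it in via the data_collator argument of MyTrainer below.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,              # pad to the longest sequence in each batch
    label_pad_token_id=-100,   # padded label positions are ignored by the loss
)
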
# βœ… Trainer setup
trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    # data_collator=data_collator,  # enable if needed (see the optional sketch above)
)
print("[디버깅] Trainer μΈμŠ€ν„΄μŠ€ 생성 μ™„λ£Œ")

# βœ… Start training
print("[DEBUG] Starting training")
trainer.train()