File size: 2,406 Bytes
013e703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66ff764
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2dc765b
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# Read the training-data file and parse it as JSON.
import json
import os
from datasets import Dataset

training_file_path = os.path.join(".", "training_data.json")
print("file path:", training_file_path)

# json.load returns the parsed object directly — expected to be a list of
# {"natural_language_query": ..., "sql_query": ...} records (Dataset.from_list
# below requires a list). The original pre-initialized a dict here, which was
# both unnecessary and the wrong type.
with open(training_file_path, "r", encoding='utf-8') as training_file:
    training_data_json = json.load(training_file)

# Load the records into a Hugging Face Dataset and split into train/test.
from datasets import Dataset, DatasetDict

training_dataset = Dataset.from_list(training_data_json)

# Fixed seed makes the 80/20 split reproducible across runs; without it the
# evaluation numbers are not comparable between invocations of this script.
train_test_split = training_dataset.train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

print(dataset)

# Select the pretrained model and tokenizer.
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # base instruct model to fine-tune
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Causal-LM tokenizers sometimes ship without a pad token; the later
# padding='max_length' tokenization would raise in that case, so fall
# back to the EOS token as padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("model loaded")

# Data preprocessing.
def preprocess_function(examples):
    """Tokenize prompt + target pairs for causal-LM fine-tuning.

    For a causal LM the labels must align token-for-token with input_ids.
    The original code tokenized the prompt and the SQL target separately,
    so the labels were misaligned (the model was being taught to emit SQL
    tokens at prompt positions). Instead, build ONE text per example
    (prompt + target + EOS), tokenize it once, and use its input_ids as
    labels with padding positions masked to -100 so they are ignored by
    the cross-entropy loss.

    Args:
        examples: batched mapping with 'natural_language_query' and
            'sql_query' string lists (as supplied by Dataset.map).
    Returns:
        dict with 'input_ids', 'attention_mask' and masked 'labels'.
    """
    texts = [
        f"自然语言查询: {q}\nSQL查询: {sql}{tokenizer.eos_token}"
        for q, sql in zip(examples['natural_language_query'], examples['sql_query'])
    ]
    model_inputs = tokenizer(texts, max_length=256, truncation=True, padding='max_length')
    # Copy input_ids as labels, replacing pad tokens with -100 (the ignore
    # index of the loss) so padding does not contribute to training.
    model_inputs["labels"] = [
        [tok if tok != tokenizer.pad_token_id else -100 for tok in ids]
        for ids in model_inputs["input_ids"]
    ]
    return model_inputs

encoded_dataset = dataset.map(preprocess_function, batched=True)

print("data pre-processed")

# Training configuration.
from transformers import TrainingArguments, Trainer

fine_tuned_dir = os.path.join("..", "fine-tuned-models")

# Hyper-parameters gathered in one mapping, then unpacked into
# TrainingArguments — same settings as an inline constructor call.
training_config = dict(
    output_dir=fine_tuned_dir,            # where checkpoints are written
    eval_strategy="epoch",                # evaluate once per epoch
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,                   # keep only the two newest checkpoints
    save_steps=500,
    prediction_loss_only=True,            # skip storing logits during eval
)
training_args = TrainingArguments(**training_config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
)

print("training parameter set")

# Start fine-tuning.
trainer.train()

print("training completed")

# Evaluate on the held-out test split.
results = trainer.evaluate()
print(results)

# Save the fine-tuned model AND its tokenizer — without the tokenizer files
# the saved directory cannot be reloaded for inference on its own.
trainer.save_model('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')
print("trained model saved")