"""Fine-tune Qwen2.5-1.5B-Instruct to turn natural-language questions into SQL.

Pipeline: load question/SQL pairs from ``training_data.json``, split them
80/20 into train/test, tokenize them for causal-LM training, fine-tune with
the Hugging Face ``Trainer``, evaluate on the held-out split, and save the
resulting model.

The JSON file is expected to hold a list of records, each with the keys
``natural_language_query`` and ``sql_query``.
"""
import json
import os

from datasets import Dataset
from datasets import DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TrainingArguments, Trainer

# Token budgets. The prompt and the SQL target each keep the 256-token limit
# they had when they were tokenized separately; the training sequence is the
# two concatenated, so it is padded to the sum.
MAX_PROMPT_LENGTH = 256
MAX_TARGET_LENGTH = 256
MAX_SEQ_LENGTH = MAX_PROMPT_LENGTH + MAX_TARGET_LENGTH

# Label value skipped by the cross-entropy loss used by the causal-LM head.
IGNORE_INDEX = -100

# Read the training file and parse the JSON.
training_file_path = os.path.join(".", "training_data.json")
print("file path:", training_file_path)
with open(training_file_path, "r", encoding='utf-8') as training_file:
    training_data_json = json.load(training_file)

# Load the records into a Dataset and split it into train / test.
# The test split doubles as the evaluation set below. No seed is passed, so
# the split differs from run to run.
training_dataset = Dataset.from_list(training_data_json)
train_test_split = training_dataset.train_test_split(test_size=0.2)
dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})
print(dataset)

# Select the pretrained model and its tokenizer.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
if tokenizer.pad_token_id is None:
    # Padding below needs a pad id; fall back to EOS when none is defined.
    tokenizer.pad_token = tokenizer.eos_token
print("model loaded")


# Data preprocessing.
def preprocess_function(examples):
    """Tokenize a batch of question/SQL pairs for causal-LM fine-tuning.

    A causal LM shifts ``labels`` internally and predicts token ``i + 1``
    from tokens ``0..i``, so ``labels`` must be position-aligned with
    ``input_ids``. Each example therefore becomes ONE sequence,
    ``prompt + sql + eos``, and the labels are that same sequence with the
    prompt and padding positions replaced by ``IGNORE_INDEX`` so that only
    the SQL tokens (and the closing EOS) contribute to the loss.

    Args:
        examples: Batched mapping (as passed by ``Dataset.map`` with
            ``batched=True``) with the columns ``natural_language_query``
            and ``sql_query``.

    Returns:
        A dict with ``input_ids``, ``attention_mask`` and ``labels``; each
        value is a list of ``MAX_SEQ_LENGTH``-long integer lists.
    """
    prompts = [f"自然语言查询: {q}\nSQL查询: " for q in examples['natural_language_query']]
    targets = list(examples['sql_query'])

    # Tokenize without padding or special tokens; the sequences are
    # assembled by hand so nothing gets inserted between prompt and target.
    prompt_batch = tokenizer(prompts, add_special_tokens=False)["input_ids"]
    target_batch = tokenizer(targets, add_special_tokens=False)["input_ids"]

    # Terminate every target with EOS so the model learns where the SQL ends.
    eos_ids = [] if tokenizer.eos_token_id is None else [tokenizer.eos_token_id]
    pad_id = tokenizer.pad_token_id

    model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt_ids, target_ids in zip(prompt_batch, target_batch):
        prompt_ids = prompt_ids[:MAX_PROMPT_LENGTH]
        # Reserve room for EOS inside the target budget.
        target_ids = target_ids[:MAX_TARGET_LENGTH - len(eos_ids)] + eos_ids

        sequence = prompt_ids + target_ids
        pad_len = MAX_SEQ_LENGTH - len(sequence)

        model_inputs["input_ids"].append(sequence + [pad_id] * pad_len)
        model_inputs["attention_mask"].append([1] * len(sequence) + [0] * pad_len)
        model_inputs["labels"].append(
            [IGNORE_INDEX] * len(prompt_ids) + target_ids + [IGNORE_INDEX] * pad_len
        )
    return model_inputs


encoded_dataset = dataset.map(preprocess_function, batched=True)
print("data pre-processed")

# Configure the training parameters.
fine_tuned_dir = os.path.join("..", "fine-tuned-models")
training_args = TrainingArguments(
    output_dir=fine_tuned_dir,
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    save_steps=500,
    prediction_loss_only=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
)
print("training parameter set")

# Start fine-tuning.
trainer.train()
print("training completed")

# Evaluate the model on the held-out split.
results = trainer.evaluate()
print(results)

# Save the fine-tuned model.
trainer.save_model('./fine_tuned_model')
print("trained model saved")