"""Entry point for supervised fine-tuning: clean the dataset, then run training."""

import os
import sys
import json

from llamafactory.train.tuner import run_exp
from llamafactory.extras.misc import get_current_device

from weclone.utils.config import load_config
from weclone.utils.log import logger
from weclone.data.clean.strategies import LLMCleaningStrategy


def main() -> None:
    """Run the SFT pipeline: load configs, clean data, then start training.

    Steps, in order:
      1. Load the ``train_sft`` and ``make_dataset`` configurations.
      2. Log a warning when the current device is the CPU.
      3. Run ``LLMCleaningStrategy.clean()`` and verify that the path it
         returns exists on disk; otherwise log an error and exit with
         status 1.
      4. Log the training configuration as pretty-printed JSON.
      5. Hand the training configuration to ``run_exp``.
    """
    train_config = load_config(arg_type="train_sft")
    dataset_config = load_config(arg_type="make_dataset")

    device = get_current_device()
    # NOTE(review): LLaMA-Factory's get_current_device() returns a
    # torch.device, and a torch.device does not compare equal to a plain
    # string, so `device == "cpu"` would never be true. Compare on the
    # device's `.type` instead; if the value is already a string (no `.type`
    # attribute), fall back to comparing the value itself.
    device_type = getattr(device, "type", device)
    if device_type == "cpu":
        logger.warning("请注意你正在使用CPU训练,非Mac设备可能会出现问题")

    cleaner = LLMCleaningStrategy(make_dataset_config=dataset_config)
    cleaned_data_path = cleaner.clean()

    # Abort before training if the cleaning step did not leave its output
    # file where it said it would.
    if not os.path.exists(cleaned_data_path):
        logger.error(f"错误:文件 '{cleaned_data_path}' 不存在,请确保数据处理步骤已正确生成该文件。")
        sys.exit(1)

    # ensure_ascii=False keeps non-ASCII config values readable in the log.
    formatted_config = json.dumps(train_config, indent=4, ensure_ascii=False)
    logger.info(f"微调配置:\n{formatted_config}")

    run_exp(train_config)


if __name__ == "__main__":
    main()