import pickle
import sys
from pathlib import Path

import faiss
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS


def train(files_path):
    """Build a FAISS vector store from the text files under *files_path*.

    Every regular file matching ``**/*.*`` below *files_path* is read as
    UTF-8, split into chunks of at most 1000 characters on newlines (no
    overlap), embedded with ``OpenAIEmbeddings`` and indexed with FAISS.

    Two artifacts are written to the ``after_training`` directory (created
    if missing), relative to the current working directory:

    * ``training.index`` -- the raw FAISS index.
    * ``faiss.pkl`` -- the pickled store with its ``index`` attribute set
      to ``None``; the index has to be re-attached after loading.

    Args:
        files_path: Directory (``str`` or path-like) that is searched
            recursively for training files.

    Returns:
        The status string ``"训练完成"`` ("training finished") on success,
        or ``None`` when no files were found or the files contained no
        text; in both failure cases a message is printed to stderr.
    """
    # Keep regular files only: "**/*.*" also matches directories whose name
    # contains a dot, and open() would fail on those. Sorting makes the chunk
    # order independent of the filesystem's enumeration order.
    training_files = sorted(
        path for path in Path(files_path).glob("**/*.*") if path.is_file()
    )
    if not training_files:
        print(
            f"The folder {files_path} should be populated with at least one "
            ".txt or .md file.",
            file=sys.stderr,
        )
        return None

    splitter = CharacterTextSplitter(
        chunk_size=1000, separator="\n", chunk_overlap=0
    )
    chunks = []
    for path in training_files:
        with open(path, "r", encoding="utf-8") as f:
            print(f"Add {f.name} to dataset")
            chunks.extend(splitter.split_text(f.read()))

    if not chunks:
        # NOTE(review): building the store from an empty list of texts is
        # expected to fail inside FAISS.from_texts -- bail out with a
        # readable message instead.
        print(
            f"The files in {files_path} did not contain any text to index.",
            file=sys.stderr,
        )
        return None

    store = FAISS.from_texts(chunks, OpenAIEmbeddings())

    # The output directory is not guaranteed to exist on a fresh checkout.
    output_dir = Path("after_training")
    output_dir.mkdir(parents=True, exist_ok=True)

    faiss.write_index(store.index, str(output_dir / "training.index"))
    # The index was just saved on its own, so detach it before pickling the
    # rest of the store.
    store.index = None
    with open(output_dir / "faiss.pkl", "wb") as f:
        pickle.dump(store, f)

    return "训练完成"