training_data_chat / training.py
kain183's picture
new chat
6c1e91d
import sys
from langchain.vectorstores import FAISS
from pathlib import Path
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
import pickle
import faiss
def train(files_path):
    """Build a FAISS vector store from the text files under *files_path*.

    Every regular file matching ``**/*.*`` below ``files_path`` is read as
    UTF-8, split on newlines into chunks of at most 1000 characters, embedded
    with ``OpenAIEmbeddings`` and indexed with FAISS. Two artifacts are
    written to the ``after_training`` directory (relative to the current
    working directory):

    * ``training.index`` -- the raw FAISS index, written by ``faiss.write_index``
    * ``faiss.pkl`` -- the pickled vector store with its ``index`` set to None

    Args:
        files_path: Directory (str or path-like) that is searched recursively
            for training files.

    Returns:
        The status string "训练完成" ("training finished") on success, or
        ``None`` when there is nothing to train on (no files found, or the
        files contain no text). In that case a message is printed to stderr
        and nothing is written.

    Raises:
        UnicodeDecodeError: If a matched file is not valid UTF-8.
        OSError: If a file cannot be read or the output cannot be written.
    """
    # "**/*.*" also matches directories whose name contains a dot
    # (e.g. "v1.0/"); opening one of those would raise, so keep files only.
    training_files = [
        path for path in Path(files_path).glob("**/*.*") if path.is_file()
    ]
    if not training_files:
        print(
            f"The folder {files_path} should be populated with at least one .txt or .md file.",
            file=sys.stderr,
        )
        return None

    texts = []
    for path in training_files:
        print(f"Add {path} to dataset")
        texts.append(path.read_text(encoding="utf-8"))

    splitter = CharacterTextSplitter(chunk_size=1000, separator="\n", chunk_overlap=0)
    chunks = []
    for text in texts:
        chunks.extend(splitter.split_text(text))
    if not chunks:
        # Files were found but they yielded no text chunks; there is nothing
        # to embed or index, so report it the same way as an empty folder.
        print(
            f"The files in {files_path} do not contain any text to train on.",
            file=sys.stderr,
        )
        return None

    # Make sure the output directory exists *before* computing embeddings, so
    # a filesystem problem is detected before any embedding requests are made.
    Path("after_training").mkdir(parents=True, exist_ok=True)

    store = FAISS.from_texts(chunks, OpenAIEmbeddings())

    # The index is saved separately and detached from the store before
    # pickling -- presumably because the raw FAISS index object cannot be
    # pickled directly. Whoever loads faiss.pkl must re-attach the index
    # read back from training.index.
    faiss.write_index(store.index, "after_training/training.index")
    store.index = None
    with open("after_training/faiss.pkl", "wb") as f:
        pickle.dump(store, f)

    return "训练完成"