File size: 1,188 Bytes
6c1e91d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import sys
from langchain.vectorstores import FAISS
from pathlib import Path
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
import pickle
import faiss


def train(files_path):
    """Build a FAISS vector store from the text files under *files_path*.

    Every regular file matching ``**/*.*`` below ``files_path`` is read as
    UTF-8 and split with ``CharacterTextSplitter(chunk_size=1000,
    separator="\\n", chunk_overlap=0)``. The resulting chunks are embedded
    with ``OpenAIEmbeddings`` and indexed via ``FAISS.from_texts``.

    Two artifacts are written to the ``after_training`` directory (relative
    to the current working directory; created if missing):

    * ``training.index`` -- the raw FAISS index, via ``faiss.write_index``.
    * ``faiss.pkl`` -- the pickled store object with its ``index`` attribute
      set to ``None`` (the index is persisted separately above).

    Args:
        files_path: Directory that is searched recursively for training files.

    Returns:
        The string ``"训练完成"`` ("training complete") on success, or ``None``
        when there is nothing to index (a message is printed to stderr).
    """
    # ``*.*`` also matches directories whose name contains a dot, so keep
    # regular files only. Sorting makes the processing order reproducible.
    training_files = sorted(
        path for path in Path(files_path).glob("**/*.*") if path.is_file()
    )
    if not training_files:
        print(
            f"The folder {files_path} should be populated with at least one .txt or .md file.",
            file=sys.stderr,
        )
        return None

    data = []
    for training_file in training_files:
        with open(training_file, "r", encoding="utf-8") as f:
            print(f"Add {f.name} to dataset")
            data.append(f.read())

    text_splitter = CharacterTextSplitter(chunk_size=1000, separator="\n", chunk_overlap=0)
    docs = [chunk for text in data for chunk in text_splitter.split_text(text)]
    if not docs:
        # NOTE(review): building a FAISS store from zero texts is expected to
        # fail (no embedding to size the index from) -- bail out before the
        # embedding call instead of crashing there.
        print(
            f"No text chunks could be produced from the files in {files_path}; nothing to index.",
            file=sys.stderr,
        )
        return None

    # Make sure the output directory exists before paying for the embeddings.
    output_dir = Path("after_training")
    output_dir.mkdir(parents=True, exist_ok=True)

    store = FAISS.from_texts(docs, OpenAIEmbeddings())
    faiss.write_index(store.index, str(output_dir / "training.index"))
    # The index has just been saved on its own; detach it so that only the
    # rest of the store goes into the pickle.
    store.index = None

    with open(output_dir / "faiss.pkl", "wb") as f:
        pickle.dump(store, f)
    return "训练完成"