boryasbora committed on
Commit
9514ca1
·
verified ·
1 Parent(s): cea573b

Create create_retriever.py

Browse files
Files changed (1) hide show
  1. create_retriever.py +56 -0
create_retriever.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.schema import Document
2
+ import pickle
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain_openai import OpenAIEmbeddings
5
+ from langchain_chroma import Chroma
6
+ from langchain.retrievers import ParentDocumentRetriever
7
+ from langchain.storage import InMemoryStore
8
+ import os
9
+ from typing import Iterable
10
+ import json
11
+ from tqdm import tqdm
12
+ from langchain_huggingface import HuggingFaceEmbeddings
13
# Embedding model shared by the vector store built below; loaded once at
# import time because constructing it is the expensive part of the script.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")
14
+
15
def parent_retriever(chroma_path, embeddings):
    """Build a ``ParentDocumentRetriever`` on top of a Chroma collection.

    Args:
        chroma_path: Directory passed to Chroma as ``persist_directory``.
        embeddings: Embedding function handed to the Chroma vector store.

    Returns:
        A ``ParentDocumentRetriever`` wired to a Chroma collection named
        ``"full_documents"`` and a fresh ``InMemoryStore`` docstore, with
        ``search_kwargs={"k": 5}``.
    """
    # Parent documents - the large chunks kept in the docstore.
    parent_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=500,
    )

    # Child documents - the small chunks.
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=50,
    )

    # The storage layer for the parent chunks. It lives in memory only, so
    # the caller is responsible for saving it if it must outlive the process.
    store = InMemoryStore()

    vectorstore = Chroma(
        collection_name="full_documents",
        embedding_function=embeddings,
        persist_directory=chroma_path,
    )
    retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
        search_kwargs={"k": 5},
    )
    return retriever
36
+
37
def save_to_pickle(obj, filename):
    """Serialize ``obj`` to ``filename`` with pickle.

    Args:
        obj: Any picklable object (the script uses it for the docstore dict).
        filename: Path of the file to create or overwrite.

    Returns:
        None. The file is written in binary mode using
        ``pickle.HIGHEST_PROTOCOL``.
    """
    with open(filename, "wb") as file:
        pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
43
+
44
# Retriever whose Chroma data is persisted under 'ohw_proj_chorma_db'.
# NOTE: the directory name's spelling ("chorma") is kept as-is because it is
# a runtime path that other code may depend on.
retriever_repos = parent_retriever('ohw_proj_chorma_db', embeddings=embedding)
45
def load_docs_from_jsonl(file_path) -> Iterable[Document]:
    """Load ``Document`` objects from a JSON Lines file.

    Each non-blank line must be a JSON object whose keys are valid
    ``Document`` constructor arguments.

    Args:
        file_path: Path to the JSON Lines file.

    Returns:
        A list of ``Document`` instances, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    docs = []
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Tolerate blank lines instead of failing in json.loads.
            if not line.strip():
                continue
            docs.append(Document(**json.loads(line)))
    return docs
53
documents = load_docs_from_jsonl('project_readmes.json')

# Add documents one at a time so tqdm reports per-document progress.
for document in tqdm(documents):
    retriever_repos.add_documents([document])

# The vector store is given a persist directory, but the docstore is an
# InMemoryStore; pickle its underlying dict so it can be reloaded later.
save_to_pickle(retriever_repos.docstore.store, 'ohw_proj_chorma_db.pcl')