Spaces:
Running
Running
File size: 1,463 Bytes
83686b4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
# Hugging Face model identifier of the sentence-embedding model used below.
check_point = 'nomic-ai/nomic-embed-text-v1'
# Loaded once at import time and shared by the functions in this module.
# NOTE(review): trust_remote_code=True lets code shipped with the model
# repository run locally - confirm the checkpoint source is trusted/pinned.
embedding_model = SentenceTransformer(check_point,trust_remote_code=True)
def parese_doc(doc, first_section, ignore_after, *, group_size=20, overlap=5):
    """Extract a document's text and split it into overlapping sentence chunks.

    The text of every page is concatenated, newlines are replaced by spaces
    and the result is lowercased. If both markers are found (and in order),
    only the text from the first occurrence of ``first_section`` up to, but
    not including, the last occurrence of ``ignore_after`` is kept; otherwise
    the whole text is used. The text is then split on ``'. '`` and regrouped
    into windows of ``group_size`` sentences, each window starting
    ``group_size - overlap`` sentences after the previous one.

    Args:
        doc: Reader-like object exposing ``pages``; each page must provide
            ``extract_text()``.
        first_section: Marker at which the relevant text starts. Matched
            case-insensitively, because the document text is lowercased.
        ignore_after: Marker after which text is discarded. Matched
            case-insensitively as well.
        group_size: Number of sentences per chunk.
        overlap: Number of sentences shared by two consecutive chunks.

    Returns:
        A list with at least one chunk string (``['']`` for an empty
        document).

    Raises:
        ValueError: If ``group_size`` is not positive or ``overlap`` is not
            in ``[0, group_size)`` (the window would never advance).
    """
    if group_size <= 0:
        raise ValueError("group_size must be a positive integer")
    if not 0 <= overlap < group_size:
        raise ValueError("overlap must satisfy 0 <= overlap < group_size")

    # ``or ''`` guards against an extract_text() that yields None for a page
    # with no extractable text; join avoids quadratic string concatenation.
    page_texts = [page.extract_text() or '' for page in doc.pages]
    cleaned_string = ''.join(page_texts).replace('\n', ' ').lower()

    # The text was lowercased above, so the markers must be lowercased too or
    # a mixed-case marker could never match.
    start_index = cleaned_string.find(first_section.lower())
    end_index = cleaned_string.rfind(ignore_after.lower())
    # Trim only when the markers delimit a non-empty span; reversed markers
    # fall back to the full text instead of silently dropping everything.
    if 0 <= start_index < end_index:
        cleaned_string = cleaned_string[start_index:end_index]

    sentence_list = cleaned_string.split('. ')
    step = group_size - overlap
    # str.split always returns at least one element, so index 0 is always
    # emitted, matching the original do-while style loop.
    return [
        '. '.join(sentence_list[i:i + group_size])
        for i in range(0, len(sentence_list), step)
    ]
def get_embeddings(doc):
    """Return the embedding of ``doc`` from the module-level model.

    ``doc`` is handed unchanged to ``embedding_model.encode`` and whatever
    that call returns is returned as-is.

    NOTE(review): the nomic-embed-text-v1 model card describes task prefixes
    (e.g. ``search_document: ``) for its inputs - confirm whether callers are
    expected to add them before calling this function.
    """
    return embedding_model.encode(doc)
def create_embedding(context_list):
    """Build a FAISS flat L2 index over the embeddings of ``context_list``.

    Args:
        context_list: Iterable of text chunks to embed. The position of each
            chunk becomes its id in the returned index.

    Returns:
        A ``faiss.IndexFlatL2`` whose dimension is the model's sentence
        embedding dimension. It holds one vector per chunk, or no vectors
        when ``context_list`` is empty.
    """
    embedding_dimension = embedding_model.get_sentence_embedding_dimension()
    index = faiss.IndexFlatL2(embedding_dimension)

    contexts = list(context_list)
    if not contexts:
        # Nothing to embed: adding a shape-(0,) array would be rejected by
        # index.add, so hand back the empty index instead.
        return index

    # Encode all chunks in one call so the model can batch them, instead of
    # running one forward pass per chunk.
    embeddings = get_embeddings(contexts)
    # FAISS expects a C-contiguous float32 matrix of shape (n, dimension).
    embeddings_array = np.ascontiguousarray(embeddings, dtype=np.float32)
    index.add(embeddings_array)
    return index
|