File size: 1,463 Bytes
83686b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss


# Model identifier handed to SentenceTransformer below.
check_point = 'nomic-ai/nomic-embed-text-v1'
# Shared encoder instance used by the embedding helpers in this module.
# It is constructed at import time, so importing this module loads the model.
# NOTE(review): trust_remote_code=True presumably lets the loader execute code
# shipped with the model repository — confirm the source is trusted.
embedding_model = SentenceTransformer(check_point,trust_remote_code=True)

def parese_doc(doc, first_section, ignore_after, *, group_size=20, overlap=5):
    """Extract a document's text and split it into overlapping sentence chunks.

    The text of every page is concatenated, newlines are replaced by spaces
    and the result is lower-cased. When both markers are found, only the text
    from the first occurrence of ``first_section`` up to (not including) the
    last occurrence of ``ignore_after`` is kept; otherwise the whole text is
    used. The text is then split on ``'. '`` and regrouped into windows of
    ``group_size`` sentences, with consecutive windows sharing ``overlap``
    sentences.

    Args:
        doc: Object exposing ``pages``, an iterable of objects that provide
            ``extract_text()`` (presumably a PDF reader; confirm with caller).
        first_section: Marker text at which the useful content starts.
            Matched case-insensitively.
        ignore_after: Marker text after which content is discarded.
            Matched case-insensitively.
        group_size: Number of sentences per chunk.
        overlap: Number of sentences shared by consecutive chunks; must be
            smaller than ``group_size``.

    Returns:
        A list of chunk strings, each made of up to ``group_size`` sentences
        joined with ``'. '``. The list always holds at least one element
        (a single empty string when no text was extracted).

    Raises:
        ValueError: If ``group_size`` is not positive or ``overlap`` is not
            in ``[0, group_size)``.
    """
    if group_size <= 0 or not 0 <= overlap < group_size:
        # A non-positive step would never advance the window.
        raise ValueError(
            'group_size must be positive and overlap must satisfy '
            '0 <= overlap < group_size'
        )

    # Join once instead of repeated "+="; "or ''" keeps a page whose
    # extract_text() yields None from raising TypeError.
    raw_text = ''.join(page.extract_text() or '' for page in doc.pages)
    cleaned_string = raw_text.replace('\n', ' ').lower()

    # The text was lower-cased above, so the markers must be as well or a
    # mixed-case marker could never match.
    start_index = cleaned_string.find(first_section.lower())
    end_index = cleaned_string.rfind(ignore_after.lower())
    if start_index != -1 and end_index != -1:
        cleaned_string = cleaned_string[start_index:end_index]

    sentence_list = cleaned_string.split('. ')
    step = group_size - overlap
    # str.split always returns at least one element, so the range below
    # always produces at least one window.
    return [
        '. '.join(sentence_list[i:i + group_size])
        for i in range(0, len(sentence_list), step)
    ]

def get_embeddings(doc):
    """Return the embedding of ``doc`` computed by the module-level model.

    ``doc`` is forwarded unchanged to ``embedding_model.encode`` and the
    encoder's result is returned as-is.
    """
    return embedding_model.encode(doc)

def create_embedding(context_list):
    """Embed every context and return a FAISS L2 index holding the vectors.

    Args:
        context_list: Iterable of text chunks to embed. Vectors are added in
            iteration order, so position ``i`` in the index corresponds to
            the ``i``-th context.

    Returns:
        A ``faiss.IndexFlatL2`` whose dimension is the model's sentence
        embedding dimension. The index is empty when ``context_list`` is
        empty.
    """
    embedding_dimension = embedding_model.get_sentence_embedding_dimension()
    index = faiss.IndexFlatL2(embedding_dimension)

    contexts = list(context_list)
    if not contexts:
        # With no contexts the embedding array would have shape (0,), which
        # index.add rejects; hand back the empty index instead.
        return index

    # Encode all contexts in one call so the encoder can batch them, rather
    # than invoking it once per context.
    embeddings = get_embeddings(contexts)
    # FAISS expects a C-contiguous float32 matrix of shape (n, dimension).
    embeddings_array = np.ascontiguousarray(embeddings, dtype=np.float32)
    index.add(embeddings_array)
    return index