# Swap in pysqlite3 before chromadb is imported: ChromaDB needs a newer
# sqlite3 than some hosts ship, and the override only takes effect pre-import.
__import__("pysqlite3")
import sys

sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")

import os
import uuid

import chromadb
import pypdf
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI
VECTOR_NAME = "database"
EMBEDDING_MODEL = "togethercomputer/m2-bert-80M-2k-retrieval"
CHROMA_PATH = "chroma_storage"

# Together AI exposes an OpenAI-compatible endpoint, so the OpenAI client
# works against its base URL.
api_key = os.getenv("TOGETHER_API")
ai_client = OpenAI(api_key=api_key, base_url="https://api.together.xyz/v1")
def extract_pdf(pdf_path: str) -> str:
    """Extract the text of every page in a PDF, marking page boundaries."""
    text = ""
    with open(pdf_path, "rb") as file:
        reader = pypdf.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text()
            text += "\n--PAGE BREAK--\n"
    return text
def create_vectorDB():
    """Build the Chroma collection from every PDF in the reports folder."""
    reports_dir = os.path.join(os.getcwd(), "data_ingetion", "firms_report")
    complete_text = ""
    for doc_path in os.listdir(reports_dir):
        complete_text += extract_pdf(os.path.join(reports_dir, doc_path))
        complete_text += "\n\n"

    # Split into overlapping chunks small enough for the 2k-context
    # embedding model.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=84,
        length_function=len,
        is_separator_regex=False,
    )
    processed_docs = splitter.split_text(complete_text)

    # Embed every chunk in one batch and store each with a unique id.
    db_client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = db_client.create_collection(VECTOR_NAME)
    response = ai_client.embeddings.create(input=processed_docs, model=EMBEDDING_MODEL)
    embeddings = [item.embedding for item in response.data]
    unique_ids = [str(uuid.uuid4()) for _ in embeddings]
    collection.add(documents=processed_docs, embeddings=embeddings, ids=unique_ids)
    return collection.name
def get_relevant_chunks(query: str):
    """Return the top matching chunks for a query as a formatted string."""
    db_client = chromadb.PersistentClient(path=CHROMA_PATH)
    found = VECTOR_NAME in [c.name for c in db_client.list_collections()]
    if found:
        collection = db_client.get_collection(VECTOR_NAME)
    else:
        # First run: build the vector store, then fetch it by name.
        collection = db_client.get_collection(create_vectorDB())

    # Embed the query with the same model used for the documents.
    response = ai_client.embeddings.create(input=query, model=EMBEDDING_MODEL)
    query_embedding = response.data[0].embedding
    relevant_chunks = collection.query(query_embeddings=[query_embedding], n_results=4)

    processed = ""
    for idx, doc in enumerate(relevant_chunks["documents"][0], start=1):
        processed += f"Chunk number {idx}\n\n"
        processed += doc + "\n\n"
    return processed
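

# A minimal smoke test, assuming the reports folder is populated and the
# TOGETHER_API environment variable is set; the query string below is only
# an illustrative placeholder, not part of the module's API.
if __name__ == "__main__":
    sample_query = "What were the firm's key risks this year?"
    print(get_relevant_chunks(sample_query))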