# Chroma requires sqlite3 >= 3.35, which many systems lack. Swap in the
# pysqlite3 binary build *before* chromadb is imported, otherwise chromadb
# binds to the system sqlite3 at import time.
__import__("pysqlite3")
import sys

sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")

import os
import uuid

import chromadb
import pypdf
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI

VECTOR_NAME = "database"
EMBEDDING_MODEL = "togethercomputer/m2-bert-80M-2k-retrieval"
CHROMA_PATH = "chroma_storage"

# Together AI exposes an OpenAI-compatible API, so the OpenAI client works
# with a swapped base URL.
api_key = os.getenv("TOGETHER_API")
ai_client = OpenAI(api_key=api_key, base_url="https://api.together.xyz/v1")

DATA_DIR = os.path.join(os.getcwd(), "data_ingetion", "firms_report")


def extract_pdf(pdf_path: str) -> str:
    """Extract the text of every page in a PDF, separated by page-break markers."""
    text = ""
    with open(pdf_path, "rb") as file:
        reader = pypdf.PdfReader(file)
        for page in reader.pages:
            # extract_text() can return None for image-only pages.
            text += page.extract_text() or ""
            text += "\n--PAGE BREAK--\n"
    return text


def create_vectorDB() -> str:
    """Build the Chroma collection from the report PDFs and return its name."""
    complete_text = ""
    for doc_name in os.listdir(DATA_DIR):
        # Skip anything that isn't a PDF so stray files don't break ingestion.
        if not doc_name.lower().endswith(".pdf"):
            continue
        complete_text += extract_pdf(os.path.join(DATA_DIR, doc_name))
        complete_text += "\n\n"

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=84,
        length_function=len,
        is_separator_regex=False,
    )
    processed_docs = splitter.split_text(complete_text)

    db_client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = db_client.create_collection(VECTOR_NAME)

    # Embed all chunks in one batch and store them under random unique IDs.
    response = ai_client.embeddings.create(input=processed_docs, model=EMBEDDING_MODEL)
    embeddings = [item.embedding for item in response.data]
    unique_ids = [str(uuid.uuid4()) for _ in range(len(embeddings))]
    collection.add(documents=processed_docs, embeddings=embeddings, ids=unique_ids)
    return collection.name


def get_relevant_chunks(query: str) -> str:
    """Embed the query and return the four most similar chunks as one string."""
    db_client = chromadb.PersistentClient(path=CHROMA_PATH)
    found = VECTOR_NAME in [c.name for c in db_client.list_collections()]
    if found:
        collection = db_client.get_collection(VECTOR_NAME)
    else:
        # First run: build the collection, then fetch it by the returned name.
        collection = db_client.get_collection(create_vectorDB())

    response = ai_client.embeddings.create(input=query, model=EMBEDDING_MODEL)
    query_embedding = response.data[0].embedding
    # query() expects a list of embeddings, even for a single query.
    relevant_chunks = collection.query(query_embeddings=[query_embedding], n_results=4)

    processed = ""
    for idx, doc in enumerate(relevant_chunks["documents"][0], start=1):
        processed += f"Chunk number {idx}\n\n"
        processed += doc + "\n\n"
    return processed
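

# Minimal usage sketch, not part of the original script. Assumptions: the
# TOGETHER_API environment variable is set, ./data_ingetion/firms_report/
# contains at least one PDF, and the question string below is purely
# illustrative. On first run this builds the Chroma collection; later runs
# reuse the persisted one.
if __name__ == "__main__":
    context = get_relevant_chunks("What were the firm's main revenue drivers?")
    print(context)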