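"""Vectorize CSV and PDF files into a persistent Chroma vector store.

Reads every *.csv and *.pdf file in the Data/ directory, turns CSV rows and
PDF pages into LangChain Documents, splits them into overlapping chunks,
embeds the chunks with HuggingFace sentence embeddings, and stores them in
a Chroma database persisted under Vector_db/.
"""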

from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document  # modern home of Document (was langchain.docstore.document)
import pandas as pd
import os
import glob
from PyPDF2 import PdfReader  # Ensure PyPDF2 is installed (its maintained successor is "pypdf")

# Define a function to process CSV files
def process_csv_files(csv_files):
    documents = []
    for file_path in csv_files:
        df = pd.read_csv(file_path)
        for _, row in df.iterrows():
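            # Flatten the row into a single space-separated string and wrap
            # it as a LangChain Document (one Document per CSV row)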
            row_content = " ".join(row.astype(str))
            documents.append(Document(page_content=row_content))
    return documents

# Define a function to process PDF files
def process_pdf_files(pdf_files):
    documents = []
    for file_path in pdf_files:
        reader = PdfReader(file_path)
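        # One Document per PDF page; extract_text() returns an empty string
        # (or None) for pages with no extractable text, which are skipped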
        for page in reader.pages:
            text = page.extract_text()
            if text:  # Only add non-empty text
                documents.append(Document(page_content=text))
    return documents

# Module-level embeddings instance, shared by vectorize_documents() and
# importable from other modules
embeddings = HuggingFaceEmbeddings()

# Define a function to perform vectorization for CSV and PDF files
def vectorize_documents():
    # Directory containing files
    data_directory = "Data"  # Replace with your folder name
    csv_files = glob.glob(os.path.join(data_directory, "*.csv"))
    pdf_files = glob.glob(os.path.join(data_directory, "*.pdf"))

    # Process CSV and PDF files
    documents = process_csv_files(csv_files) + process_pdf_files(pdf_files)

    # Splitting the text and creating chunks of these documents
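    # (chunk_size and chunk_overlap are counted in characters; by default
    # CharacterTextSplitter only splits on "\n\n", so a chunk can exceed
    # chunk_size when no separator is available)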
    text_splitter = CharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=500
    )

    text_chunks = text_splitter.split_documents(documents)

    # Insert chunks in batches; Chroma's max insert batch size is 5461,
    # so use a slightly smaller batch for safety
    batch_size = 5000
    for i in range(0, len(text_chunks), batch_size):
        batch = text_chunks[i:i + batch_size]

        # Each call appends the batch to the persistent collection in Vector_db
        Chroma.from_documents(
            documents=batch,
            embedding=embeddings,
            persist_directory="Vector_db"
        )

    print("Documents vectorized and saved in Vector_db")


# Main guard to prevent execution on import
if __name__ == "__main__":
    vectorize_documents()
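
# A minimal sketch of how the persisted store could be queried later
# (assumes the same embedding model and the Vector_db directory above):
#
#     from langchain_chroma import Chroma
#     vectordb = Chroma(
#         embedding_function=embeddings,
#         persist_directory="Vector_db",
#     )
#     results = vectordb.similarity_search("your query here", k=3)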