# Bot/create.py: build a FAISS vector store from PDF, Word, PowerPoint,
# and image (OCR) sources.

from pathlib import Path

import cv2
import pytesseract
from docx import Document
from pptx import Presentation
from dotenv import load_dotenv, find_dotenv
from langchain.schema import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Load environment variables
load_dotenv(find_dotenv())

# Paths
DATA_PATH = "data/"
DB_FAISS_PATH = "vectorstore/db_faiss"

# Set the Tesseract OCR path (update this based on your installation)
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\Rupesh Shinde\Tesseract\tesseract.exe"


# Function to extract text from images via Tesseract OCR
def extract_text_from_image(image_path):
    img = cv2.imread(str(image_path))
    if img is None:
        print(f"⚠️ Warning: Unable to read image {image_path}")
        return ""
    text = pytesseract.image_to_string(img)
    return text.strip()
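
# Note: a grayscale pass often improves OCR accuracy on noisy scans.
# A minimal optional variant (an assumption, not part of the original
# pipeline):
#
#   gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
#   text = pytesseract.image_to_string(gray)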


# Step 1: Load Documents from Multiple Sources
def load_documents(data_path):
    documents = []

    # Load PDFs (already returned as LangChain Documents)
    pdf_loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)
    documents.extend(pdf_loader.load())

    # Load Word files
    for file in Path(data_path).glob("*.docx"):
        doc = Document(file)
        text = "\n".join(para.text for para in doc.paragraphs)
        documents.append(LangchainDocument(page_content=text, metadata={"source": file.name}))

    # Load PowerPoint files, one document per non-empty slide
    for file in Path(data_path).glob("*.pptx"):
        prs = Presentation(file)
        for i, slide in enumerate(prs.slides):
            text = "\n".join(shape.text for shape in slide.shapes if hasattr(shape, "text"))
            if text.strip():
                documents.append(LangchainDocument(page_content=text, metadata={"source": file.name, "slide": i + 1}))

    # Load images (OCR): JPG and PNG, including subdirectories
    for pattern in ("*.jpg", "*.png"):
        for image_file in Path(data_path).rglob(pattern):
            text = extract_text_from_image(image_file)
            if text:
                documents.append(LangchainDocument(page_content=text, metadata={"source": image_file.name}))

    print(f"✅ Loaded {len(documents)} documents from {data_path}")
    return documents


# Step 2: Create Chunks
def create_chunks(documents):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    text_chunks = text_splitter.split_documents(documents)
    print(f"✅ Created {len(text_chunks)} text chunks")
    return text_chunks


# Step 3: Create Vector Embeddings
def get_embedding_model():
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


# Step 4: Store embeddings in FAISS
def create_vector_store(text_chunks):
    embedding_model = get_embedding_model()
    print("🔄 Creating vector store...")
    db = FAISS.from_documents(text_chunks, embedding_model)
    db.save_local(DB_FAISS_PATH)
    print("✅ Vector store created/updated successfully.")


# Step 5: Main Execution
if __name__ == "__main__":
    print("🚀 Starting process...")
    documents = load_documents(DATA_PATH)
    text_chunks = create_chunks(documents)
    create_vector_store(text_chunks)
    print("🎉 Process completed successfully!")