"""Helpers for turning a folder of PDFs into a persisted Chroma vectorstore."""

import os
from typing import List

import fitz  # PyMuPDF
import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document

CHROMA_DIR = os.path.abspath("chroma")
# NOTE(review): the leading characters in the log messages of this module look
# like mis-encoded emoji; they are left untouched here -- confirm the file's
# encoding before normalising them.
print("๐Ÿ“‚ Loading vectorstore from:", CHROMA_DIR)
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Set this to your actual file on HF
HF_FILE_URL = "https://huggingface.co/spaces/DurgaDeepak/eat2fit/resolve/main/meal_plans/Lafayette%2C%20Natasha%20-%20Fit%20By%20Tasha%20High%20Protein%20Recipes%20_%2052%20High%20Protein%20Clean%20Recipes%20%26%20Meal%20Plan%20(2021).pdf"

# Files smaller than this are treated as Git LFS pointer stubs, not real PDFs.
LFS_POINTER_MAX_BYTES = 1000
# (connect, read) timeouts in seconds so a stalled server cannot hang the caller.
DOWNLOAD_TIMEOUT = (10, 60)
# Size of each streamed piece written to disk while downloading.
DOWNLOAD_CHUNK_BYTES = 1 << 20


def ensure_pdf_downloaded(local_path: str, url: str, *, force: bool = False) -> None:
    """Download *url* to *local_path* unless the file is already present.

    Args:
        local_path: Destination path of the PDF on disk.
        url: HTTP(S) location to fetch the PDF from.
        force: When True, download even if *local_path* already exists and
            overwrite it. Needed to replace an LFS pointer stub, which exists
            on disk but is not the real document.

    Raises:
        RuntimeError: If the server answers with a status other than 200.
        requests.RequestException: On connection errors or timeouts.
    """
    if os.path.exists(local_path) and not force:
        return

    print(f"Downloading large PDF from: {url}")
    # Stream into a sibling temp file and rename at the end, so an interrupted
    # transfer never leaves a truncated file at local_path (which would later
    # pass the existence check above and be treated as a valid PDF).
    tmp_path = f"{local_path}.part"
    try:
        with requests.get(url, stream=True, timeout=DOWNLOAD_TIMEOUT) as response:
            if response.status_code != 200:
                raise RuntimeError(f"Failed to download PDF: {response.status_code}")
            with open(tmp_path, "wb") as f:
                for piece in response.iter_content(chunk_size=DOWNLOAD_CHUNK_BYTES):
                    f.write(piece)
        os.replace(tmp_path, local_path)
    finally:
        # After a successful os.replace the temp file is gone; this only
        # cleans up after a failed or interrupted download.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print("PDF downloaded successfully.")


def load_and_chunk_pdfs(folder_path: str) -> List[Document]:
    """Read every ``.pdf`` in *folder_path* and split its text into chunks.

    Each PDF becomes one ``Document`` whose ``metadata["source"]`` is the file
    name; the documents are then split into 1000-character chunks with a
    200-character overlap.

    Args:
        folder_path: Directory to scan (non-recursively) for PDF files.

    Returns:
        The list of chunked documents produced by the text splitter.
    """
    documents = []
    # Sorted so the resulting chunk order does not depend on the arbitrary
    # ordering of os.listdir.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".pdf"):
            continue
        path = os.path.join(folder_path, filename)

        # A tiny file is taken to be an LFS pointer. It exists on disk, so the
        # download must be forced; otherwise the stub is passed to fitz.open.
        # NOTE(review): every stub is replaced with the single HF_FILE_URL
        # document regardless of its file name -- confirm this is intended.
        if os.path.getsize(path) < LFS_POINTER_MAX_BYTES:
            ensure_pdf_downloaded(path, HF_FILE_URL, force=True)

        # Context manager closes the PDF handle even if text extraction fails.
        with fitz.open(path) as doc:
            page_texts = (page.get_text() for page in doc)
            # Extract each page once and drop pages that yield no text.
            text = "\n".join(page_text for page_text in page_texts if page_text)
        documents.append(Document(page_content=text, metadata={"source": filename}))

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(documents)
    return chunks


def create_vectorstore(chunks):
    """Embed *chunks* with ``MODEL_NAME`` and store them in Chroma at ``CHROMA_DIR``.

    Args:
        chunks: Documents to embed, e.g. the output of ``load_and_chunk_pdfs``.

    Returns:
        The ``Chroma`` instance built from the documents.
    """
    embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    db = Chroma.from_documents(chunks, embeddings, persist_directory=CHROMA_DIR)
    return db


def load_vectorstore():
    """Open the Chroma store at ``CHROMA_DIR`` and print a short diagnostic.

    The diagnostic is best-effort: any error while inspecting the store is
    printed rather than raised, and the ``Chroma`` instance is returned in
    every case.

    Returns:
        The ``Chroma`` instance bound to ``CHROMA_DIR``.
    """
    print("๐Ÿ“‚ Loading from:", CHROMA_DIR)
    embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    db = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)

    # Debug block
    try:
        stored_texts = db.get()["documents"]
        print(f"โœ… Loaded vectorstore with {len(stored_texts)} docs")
        # An empty store is valid; indexing [0] on it would raise IndexError
        # and be misreported below as a load error.
        if stored_texts:
            print(f"๐Ÿงพ First doc snippet: {stored_texts[0][:100]}...")
    except Exception as e:
        print(f"โŒ Vectorstore load error: {e}")

    return db