Spaces:
Sleeping
Sleeping
import os | |
import fitz # PyMuPDF | |
import requests | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.vectorstores import Chroma | |
from langchain.embeddings import HuggingFaceEmbeddings | |
from langchain.docstore.document import Document | |
# Directory of the persisted Chroma index. "chroma" is resolved against the
# working directory in effect when this module is imported.
CHROMA_DIR = os.path.abspath("chroma")
print("π Loading vectorstore from:", CHROMA_DIR)

# Model name handed to HuggingFaceEmbeddings when building or loading the index.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Set this to your actual file on HF
# (adjacent literals below concatenate into one URL string).
HF_FILE_URL = (
    "https://huggingface.co/spaces/DurgaDeepak/eat2fit/resolve/main/meal_plans/"
    "Lafayette%2C%20Natasha%20-%20Fit%20By%20Tasha%20High%20Protein%20Recipes"
    "%20_%2052%20High%20Protein%20Clean%20Recipes%20%26%20Meal%20Plan%20(2021).pdf"
)
# First bytes of a Git LFS pointer file (its "version <spec-url>" line).
_LFS_POINTER_PREFIX = b"version https://git-lfs"


def _is_lfs_pointer(path: str) -> bool:
    """Return True if *path* is a regular file that starts with the LFS pointer prefix."""
    if not os.path.isfile(path):
        return False
    with open(path, "rb") as f:
        return f.read(len(_LFS_POINTER_PREFIX)) == _LFS_POINTER_PREFIX


def ensure_pdf_downloaded(local_path: str, url: str) -> None:
    """Make sure a real PDF exists at *local_path*, downloading it from *url* if needed.

    Nothing happens when *local_path* already exists and is not a Git LFS
    pointer stub. A missing file, or an existing LFS pointer, is (re)fetched.

    Args:
        local_path: Destination path of the PDF on disk.
        url: HTTP(S) URL to download the PDF from.

    Raises:
        RuntimeError: If the server answers with a status code other than 200.
        requests.RequestException: On connection errors or timeouts.
    """
    if os.path.exists(local_path) and not _is_lfs_pointer(local_path):
        return

    print(f"Downloading large PDF from: {url}")
    # Write to a sibling temp file and rename at the end, so an interrupted
    # download never leaves a truncated file that later looks "present".
    tmp_path = local_path + ".part"
    # timeout=(connect, read): the read timeout bounds the wait between
    # received chunks, not the total transfer time.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        if response.status_code != 200:
            raise RuntimeError(f"Failed to download PDF: {response.status_code}")
        try:
            with open(tmp_path, "wb") as f:
                # Stream in 1 MiB chunks instead of buffering the whole PDF in memory.
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        except BaseException:
            # Drop the partial file, then let the original error propagate.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise
    os.replace(tmp_path, local_path)
    print("PDF downloaded successfully.")
def _replace_with_download(path, url):
    """Fetch *url* into a staging file and move it over *path*."""
    # ensure_pdf_downloaded may treat a path that already exists as done, so
    # calling it on the pointer file itself can be a no-op. Downloading to a
    # fresh staging path and renaming guarantees the stub is actually replaced.
    staging = path + ".download"
    if os.path.exists(staging):
        os.remove(staging)  # stale leftover from an earlier, interrupted run
    ensure_pdf_downloaded(staging, url)
    os.replace(staging, path)


def load_and_chunk_pdfs(folder_path, chunk_size=1000, chunk_overlap=200):
    """Read every ``*.pdf`` in *folder_path* and split the text into chunks.

    A PDF smaller than 1000 bytes is assumed to be a Git LFS pointer stub and
    is replaced by a download of ``HF_FILE_URL`` before it is opened. Note that
    the same URL is used for every such file.

    Args:
        folder_path: Directory to scan (non-recursive) for ``.pdf`` files.
        chunk_size: Chunk size passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Chunk overlap passed to ``RecursiveCharacterTextSplitter``.

    Returns:
        The list produced by ``splitter.split_documents`` over one ``Document``
        per PDF, each carrying ``{"source": <filename>}`` as metadata.

    Raises:
        RuntimeError: Propagated from ``ensure_pdf_downloaded`` when a
            replacement download fails.
    """
    documents = []
    # Sorted so the chunk order does not depend on the filesystem's listing order.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".pdf"):
            continue
        path = os.path.join(folder_path, filename)

        if os.path.getsize(path) < 1000:  # LFS pointer files are tiny
            _replace_with_download(path, HF_FILE_URL)

        # Context manager closes the PDF handle even if text extraction fails.
        with fitz.open(path) as doc:
            page_texts = [page.get_text() for page in doc]
        # Pages without extractable text are left out of the joined document.
        text = "\n".join(t for t in page_texts if t)
        documents.append(Document(page_content=text, metadata={"source": filename}))

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(documents)
def create_vectorstore(chunks):
    """Build a Chroma store from *chunks* and return it.

    The chunks are embedded with ``HuggingFaceEmbeddings(model_name=MODEL_NAME)``
    and ``CHROMA_DIR`` is passed as the store's ``persist_directory``.
    """
    embedder = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    return Chroma.from_documents(
        chunks,
        embedder,
        persist_directory=CHROMA_DIR,
    )
def load_vectorstore():
    """Open the Chroma store under ``CHROMA_DIR`` and return it.

    The store is created with ``HuggingFaceEmbeddings(model_name=MODEL_NAME)``
    as its embedding function. After opening, a best-effort debug summary
    (document count and a 100-character snippet of the first document) is
    printed. Failures in that summary are reported but never raised, so the
    store object is always returned.
    """
    print("π Loading from:", CHROMA_DIR)
    embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    db = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)

    # Debug block: purely informational, deliberately swallows its own errors.
    try:
        stored = db.get()["documents"]
        print(f"β Loaded vectorstore with {len(stored)} docs")
        # An empty store is not an error; skip the snippet instead of letting
        # stored[0] raise IndexError and be reported as a load failure.
        if stored:
            print(f"π§Ύ First doc snippet: {stored[0][:100]}...")
    except Exception as e:
        print(f"β Vectorstore load error: {e}")
    return db