import glob
import os

import faiss
import numpy as np
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from transformers import RagTokenizer
from unstructured.partition.pdf import partition_pdf


def ingest_and_push(
    dataset_name="username/mealplan-chunks",
    index_path="mealplan.index",
):
    # 1) Tokenizer for chunking
    rag_tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")

    # 2) Embedder for FAISS
    embedder = SentenceTransformer("all-MiniLM-L6-v2")

    texts, sources, pages = [], [], []
    # 3) Chunk each PDF, page by page
    for pdf_path in glob.glob("meal_plans/*.pdf"):
        book = os.path.basename(pdf_path)

        # partition_pdf returns layout elements (titles, paragraphs, ...),
        # not pages, so collect element text per page number first.
        page_texts = {}
        for element in partition_pdf(filename=pdf_path):
            if not element.text or not element.text.strip():
                continue
            pg_num = element.metadata.page_number or 0
            page_texts.setdefault(pg_num, []).append(element.text)

        for pg_num in sorted(page_texts):
            # RagTokenizer encodes with its question-encoder tokenizer by default;
            # overlapping windows of up to 800 tokens (stride 50) become the chunks.
            enc = rag_tokenizer(
                "\n".join(page_texts[pg_num]),
                max_length=800,
                truncation=True,
                return_overflowing_tokens=True,
                stride=50,
            )
            for token_ids in enc["input_ids"]:
                # Decode with the same (question-encoder) tokenizer so the chunk
                # text round-trips; RagTokenizer.decode delegates to the generator.
                chunk = rag_tokenizer.question_encoder.decode(
                    token_ids, skip_special_tokens=True
                )
                texts.append(chunk)
                sources.append(book)
                pages.append(pg_num)
    # 4) Build HF Dataset and push the chunks to the Hub
    ds = Dataset.from_dict({
        "text": texts,
        "source": sources,
        "page": pages,
    })
    ds.push_to_hub(dataset_name, token=True)
    # 5) Build FAISS index over the chunk embeddings
    embeddings = embedder.encode(texts, convert_to_numpy=True).astype(np.float32)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)  # exact L2 search, CPU index
    index.add(embeddings)  # row i of the index corresponds to texts[i]
    faiss.write_index(index, index_path)
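

# Minimal retrieval sketch (not called by the ingest pipeline): it assumes the
# dataset was pushed as the default "train" split, that the FAISS index rows
# are aligned with the dataset rows (both are built from `texts` in order),
# and that the same "all-MiniLM-L6-v2" embedder is used at query time. The
# `query` string and `k` below are illustrative placeholders.
def search_chunks(query, k=5,
                  dataset_name="username/mealplan-chunks",
                  index_path="mealplan.index"):
    from datasets import load_dataset

    ds = load_dataset(dataset_name, split="train")
    index = faiss.read_index(index_path)
    embedder = SentenceTransformer("all-MiniLM-L6-v2")

    # Embed the query and fetch the k nearest chunks by L2 distance.
    query_vec = embedder.encode([query], convert_to_numpy=True).astype(np.float32)
    distances, ids = index.search(query_vec, k)

    # Map FAISS row ids back to the corresponding dataset rows.
    return [
        {
            "text": ds[int(i)]["text"],
            "source": ds[int(i)]["source"],
            "page": ds[int(i)]["page"],
            "distance": float(d),
        }
        for i, d in zip(ids[0], distances[0])
    ]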

if __name__ == "__main__":
    ingest_and_push()
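

# Optional follow-up (a sketch, not executed here): the chunks live on the Hub
# but mealplan.index stays on local disk. One way to keep them together is to
# upload the index file into the same dataset repo with huggingface_hub; the
# repo id below reuses the placeholder "username/mealplan-chunks".
#
#     from huggingface_hub import upload_file
#
#     upload_file(
#         path_or_fileobj="mealplan.index",
#         path_in_repo="mealplan.index",
#         repo_id="username/mealplan-chunks",
#         repo_type="dataset",
#     )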