DurgaDeepak committed
Commit f13a234 · verified · 1 Parent(s): e05063c

Create ingestion.py

Files changed (1):
  1. ingestion.py +48 -0
ingestion.py ADDED
@@ -0,0 +1,48 @@
+ import os
+ import glob
+
+ from datasets import Dataset
+ from transformers import RagTokenizer
+ from unstructured.partition.pdf import partition_pdf
+
+
+ def ingest_and_push(dataset_name="username/mealplan-chunks"):
+     # RagTokenizer bundles two tokenizers; use the question-encoder
+     # tokenizer for both encoding and decoding so the round trip matches.
+     tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq").question_encoder
+     texts, sources, pages = [], [], []
+
+     for pdf_path in glob.glob("pdfs/*.pdf"):
+         book = os.path.basename(pdf_path)
+         # partition_pdf yields document elements, not pages; take the
+         # page number from each element's metadata.
+         for element in partition_pdf(filename=pdf_path):
+             if not element.text:
+                 continue
+             page_num = element.metadata.page_number or 1
+             # Split the element text into overlapping token windows
+             # (plain Python lists; tensors would require uniform padding).
+             enc = tokenizer(
+                 element.text,
+                 max_length=800,
+                 truncation=True,
+                 return_overflowing_tokens=True,
+                 stride=50,
+             )
+             # Decode each token window back into a text chunk.
+             for token_ids in enc["input_ids"]:
+                 texts.append(tokenizer.decode(token_ids, skip_special_tokens=True))
+                 sources.append(book)
+                 pages.append(page_num)
+
+     # Build an HF Dataset and push it to the Hub (uses the cached login token).
+     ds = Dataset.from_dict({
+         "text": texts,
+         "source": sources,
+         "page": pages,
+     })
+     ds.push_to_hub(dataset_name)
+
+
+ if __name__ == "__main__":
+     ingest_and_push()
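
As a quick sanity check after the push, the dataset can be loaded back from the Hub. A minimal sketch, assuming the placeholder repo id from the script's default argument and a cached `huggingface-cli login` token:

from datasets import load_dataset

# "username/mealplan-chunks" is the script's placeholder default repo id.
ds = load_dataset("username/mealplan-chunks", split="train")
print(ds)                                            # features: text, source, page
print(ds[0]["source"], ds[0]["page"], ds[0]["text"][:100])

Note the 50-token stride in the ingestion script: consecutive windows overlap, so a sentence cut at one window boundary reappears intact at the start of the next chunk, which helps downstream retrieval over these rows.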