"""Gradio app that converts a PDF into a text dataset on the Hugging Face Hub.

Each page of the uploaded PDF is extracted, cleaned with :func:`clean`, and
stored as one row of a single-column (``text``) dataset that is pushed to the
``pdf2dataset`` namespace under a random name.
"""

import os
import random
import re
from string import Template

import gradio as gr
import pandas as pd
from datasets import Dataset
from huggingface_hub import HfApi
from pypdf import PdfReader

# Namespace (user/organization) that owns every dataset created by this app.
NAMESPACE = "pdf2dataset"

# Characters stripped from the extracted text.
# NOTE(review): the last four entries appear as empty strings here; they may be
# invisible/private-use characters lost in transit -- confirm against the
# original file. Empty strings are harmless: ``str.replace("", "")`` is a no-op.
to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]

# Characters replaced by an ASCII-friendly equivalent (newlines become spaces).
to_be_replaced = {
    "½": "1/2",
    "–": "-",
    "‘": "'",
    "’": "'",
    "…": "...",
    "₋": "-",
    "−": "-",
    "⓫": "11.",
    "⓬": "12.",
    "⓭": "13.",
    "⓮": "14.",
    "◦": "°",
    "❶": "1.",
    "❷": "2.",
    "❸": "3.",
    "❹": "4.",
    "❺": "5.",
    "❻": "6.",
    "❼": "7.",
    "❽": "8.",
    "❾": "9.",
    "❿": "10.",
    "\n": " ",
}


def clean(text):
    """Normalize text extracted from a PDF page.

    Args:
        text: Raw page text.

    Returns:
        The text with unwanted characters removed, special characters
        replaced, spacing around punctuation normalized and runs of spaces
        collapsed to a single space.
    """
    # Remove all the unwanted characters
    for char in to_be_removed:
        text = text.replace(char, "")

    # Replace all the characters that need to be replaced
    for char, replacement in to_be_replaced.items():
        text = text.replace(char, replacement)

    # Make sure that every "." is followed by a space
    text = re.sub(r"\.([^ ])", r". \1", text)

    # Add a space between a lowercase followed by an uppercase "aA" -> "a A" (include accents)
    text = re.sub(r"([a-zà-öø-ÿ])([A-ZÀ-ÖØ-Þ])", r"\1 \2", text)

    # Make sure that there is no space before a comma or a period, nor around a hyphen
    text = text.replace(" ,", ",")
    text = text.replace(" .", ".")
    text = text.replace(" -", "-")
    text = text.replace("- ", "-")

    # Collapse every run of two or more spaces into one. A single pass is
    # enough and, unlike a `while " " in text` loop, it always terminates.
    text = re.sub(r" {2,}", " ", text)
    return text


# `progress=gr.Progress()` is the Gradio idiom for requesting a progress
# tracker; it is not a conventional mutable default argument.
def pdf2dataset(file, progress=gr.Progress()):
    """Convert a PDF to a one-row-per-page dataset and push it to the Hub.

    Args:
        file: The uploaded PDF as provided by the ``gr.File`` input; it is
            passed directly to ``PdfReader``.
        progress: Gradio progress tracker.

    Returns:
        A tuple ``(instructions, preview, dataset_name)``: the rendered usage
        instructions, a DataFrame with the first 10 cleaned pages, and the
        random name of the created dataset.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if file is None:
        raise gr.Error("Please upload a PDF file first.")

    progress(0, desc="Starting...")
    reader = PdfReader(file)
    num_pages = len(reader.pages)
    dataset_name = f"{random.getrandbits(128):x}"

    # `or ""` keeps `clean` safe if a page yields no extractable text.
    page_texts = [
        clean(page.extract_text() or "")
        for page in progress.tqdm(reader.pages, total=num_pages, desc="Converting pages")
    ]

    progress(0, desc="Uploading to Hugging Face...")
    dataset = Dataset.from_dict({"text": page_texts})
    dataset.push_to_hub(f"{NAMESPACE}/{dataset_name}", token=os.getenv("TOKEN"))
    progress(1, desc="Done!")

    instructions = instructions_template.substitute(dataset_name=dataset_name)
    preview = pd.DataFrame(page_texts[:10], columns=["text"])
    return instructions, preview, dataset_name


def delete_dataset(dataset_name):
    """Delete a dataset from the ``pdf2dataset`` namespace.

    Args:
        dataset_name: Either ``"<name>"`` or ``"pdf2dataset/<name>"``.

    Returns:
        A human-readable status message (success or the reason for failure).
    """
    dataset_name = (dataset_name or "").strip()

    if "/" in dataset_name:
        # partition() never raises, unlike unpacking split("/") on "a/b/c".
        user_id, _, dataset_name = dataset_name.partition("/")
    else:
        user_id = NAMESPACE

    # Only datasets created by this app may be deleted.
    if user_id != NAMESPACE:
        return f"❌ Invalid namespace detected: {user_id}"
    if not dataset_name or "/" in dataset_name:
        return f"❌ Invalid dataset name: {dataset_name!r}"

    repo_id = f"{user_id}/{dataset_name}"
    # Use the same token as the upload path; when TOKEN is unset this is
    # equivalent to HfApi() and falls back to the default credentials.
    api = HfApi(token=os.getenv("TOKEN"))
    try:
        api.delete_repo(repo_id, repo_type="dataset")
    except Exception as e:  # UI boundary: report the failure instead of crashing
        return f"❌ Error deleting dataset: {e}"
    return "✅ Dataset deleted successfully."


caution_text = """⚠️ Caution:
- This process will upload your data to a public Hugging Face repository. Do not upload sensitive information.
- Anyone (including you) will be able to delete the dataset once it is uploaded.
"""

# Markdown shown to the user; `$dataset_name` is filled in after the upload.
instructions_template = Template(
    """
🔗: https://huggingface.co/datasets/pdf2dataset/$dataset_name.

```python
from datasets import load_dataset

dataset = load_dataset("pdf2dataset/$dataset_name")
```
"""
)

with gr.Blocks() as demo:
    gr.Markdown("# PDF to 🤗 Dataset")

    gr.Markdown("## 1️⃣ Upload a PDF")
    # File extensions must include the leading dot to be used as a filter.
    file = gr.File(file_types=[".pdf"], height=50)
    gr.Markdown(caution_text)

    gr.Markdown("## 2️⃣ Convert the PDF and upload")
    convert_button = gr.Button("🔄 Convert and upload")
    preview = gr.Dataframe(
        label="Preview (first 10 rows)",
        headers=["text"],
        datatype=["str"],
        row_count=10,
        wrap=True,
        height=200,
    )

    gr.Markdown("## 3️⃣ Use the dataset in your code")
    instructions = gr.Markdown(instructions_template.substitute(dataset_name="generated_dataset_name"))

    gr.Markdown("## 4️⃣ Delete the dataset (optional)")
    dataset_name_to_delete = gr.Textbox("", placeholder="Enter dataset name to delete", label="Dataset to delete")
    delete_button = gr.Button("🗑️ Delete dataset")

    # Define the actions
    convert_button.click(pdf2dataset, inputs=[file], outputs=[instructions, preview, dataset_name_to_delete])
    # The deletion status is displayed as the button's label...
    delete_button.click(delete_dataset, inputs=[dataset_name_to_delete], outputs=[delete_button])
    # ...and the label is restored as soon as the user edits the name.
    dataset_name_to_delete.input(lambda: "🗑️ Delete dataset", outputs=[delete_button])

demo.launch()