"""Load every supported file under ``data/`` and pickle the loaded documents.

The project root is taken to be the parent of the directory that contains
this script; input is read from ``<root>/data`` and the result is written to
``<root>/output/all_docs.pkl``.
"""

import pickle
from pathlib import Path

from langchain_community.document_loaders import (
    NotebookLoader,
    PyPDFLoader,
    PythonLoader,
    TextLoader,
)


def main():
    """Walk the data directory, load each supported file, and pickle the result.

    Files are matched by lower-cased suffix (``.pdf``, ``.txt``, ``.py``,
    ``.ipynb``). Each matching file is handed to its loader and whatever
    ``load()`` returns is appended to one flat list. A file that fails to
    load is reported on stdout and skipped; it does not abort the run.

    The combined list is pickled to ``output/all_docs.pkl`` (the ``output``
    directory is created if needed). The pickle is written even when nothing
    was loaded.
    """
    base_dir = Path(__file__).resolve().parent.parent
    data_dir = base_dir / "data"
    output_dir = base_dir / "output"
    output_path = output_dir / "all_docs.pkl"

    output_dir.mkdir(parents=True, exist_ok=True)

    # Suffix -> callable that takes a path string and returns a loader.
    loaders = {
        ".pdf": PyPDFLoader,
        # Force UTF-8 so decoding does not depend on the platform default.
        ".txt": lambda path: TextLoader(path, encoding="utf-8"),
        ".py": PythonLoader,
        ".ipynb": NotebookLoader,
    }

    if not data_dir.is_dir():
        # rglob() on a missing directory yields nothing, which would otherwise
        # produce an empty pickle with no hint as to why.
        print(f"[!] Data directory not found: {data_dir}")

    documents = []
    # rglob() order is filesystem-dependent; sorting keeps the pickled
    # document order reproducible across runs and machines.
    for file in sorted(data_dir.rglob("*")):
        make_loader = loaders.get(file.suffix.lower())
        if make_loader is None:
            continue
        # A directory can carry a matching suffix (e.g. "pkg.py/"); only
        # regular files are meaningful to the loaders.
        if not file.is_file():
            continue

        try:
            docs = make_loader(str(file)).load()
        except Exception as e:
            # Deliberately broad: loaders raise many unrelated error types and
            # one bad file must not stop the whole batch.
            print(f"[!] Failed to load {file.name}: {e}")
        else:
            documents.extend(docs)
            print(f"[✓] Loaded: {file.name}")

    # NOTE: pickle files must only be loaded back by trusted code.
    with open(output_path, "wb") as f:
        pickle.dump(documents, f)

    print(f"📦 Saved {len(documents)} documents to {output_path}")


if __name__ == "__main__":
    main()