# Spaces: Sleeping
from pathlib import Path | |
from langchain_community.document_loaders import ( | |
PyPDFLoader, | |
TextLoader, | |
PythonLoader, | |
NotebookLoader, | |
) | |
import pickle | |
def main():
    """Load every supported file under ``data/`` and pickle the documents.

    The ``data`` and ``output`` directories are resolved relative to the
    parent of the directory containing this script. Files are matched by
    lower-cased suffix (``.pdf``, ``.txt``, ``.py``, ``.ipynb``) and loaded
    with the corresponding loader class; files with any other suffix are
    ignored. A file whose loader raises is reported and skipped so a single
    bad input does not abort the run.

    All loaded documents are written as one pickled list to
    ``output/all_docs.pkl``.

    Raises:
        FileNotFoundError: if the ``data`` directory does not exist. Raised
            before anything is written, so an earlier output file is not
            overwritten with an empty list.
    """
    base_dir = Path(__file__).resolve().parent.parent
    data_dir = base_dir / "data"
    output_dir = base_dir / "output"
    output_path = output_dir / "all_docs.pkl"

    if not data_dir.is_dir():
        raise FileNotFoundError(f"Data directory not found: {data_dir}")

    output_dir.mkdir(parents=True, exist_ok=True)

    # Suffix -> callable that builds a loader from a path string.
    loaders = {
        ".pdf": PyPDFLoader,
        ".txt": lambda path: TextLoader(path, encoding="utf-8"),
        ".py": PythonLoader,
        ".ipynb": NotebookLoader,
    }

    documents = []
    # Sorted so the order of the pickled documents is reproducible; the raw
    # rglob order depends on the filesystem.
    for file in sorted(data_dir.rglob("*")):
        # rglob("*") also yields directories; a directory named "x.py" must
        # not be handed to a loader.
        if not file.is_file():
            continue
        make_loader = loaders.get(file.suffix.lower())
        if make_loader is None:
            continue
        try:
            docs = make_loader(str(file)).load()
            documents.extend(docs)
        except Exception as exc:
            # Deliberately broad: loaders can fail in many ways, and one bad
            # file should only be reported, not stop the whole run.
            print(f"[!] Failed to load {file.name}: {exc}")
        else:
            # Kept outside the try so a failure while printing is not
            # misreported as a load failure.
            print(f"[✓] Loaded: {file.name}")

    # NOTE: pickle files must only be read back from trusted locations;
    # unpickling can execute arbitrary code.
    with open(output_path, "wb") as f:
        pickle.dump(documents, f)

    print(f"📦 Saved {len(documents)} documents to {output_path}")
# Script entry point: run the ingestion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()