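# NOTE(review): the text "Spaces: Sleeping" preceded this module in the captured
# copy; it appears to be hosting-page status residue, not part of the program.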
import json
import os
from pathlib import Path

from langchain.docstore.document import Document
from langchain_community.document_loaders import CSVLoader, PyPDFLoader, TextLoader
from langchain_community.document_loaders.excel import UnstructuredExcelLoader
from pypdf import PdfReader
def load_file(filepath):
    """Load one file and return its contents as a list of Document objects.

    The loader is chosen from the file suffix, compared case-insensitively:

    * ``.txt`` / ``.md``  -> ``TextLoader``
    * ``.csv``            -> ``CSVLoader``
    * ``.pdf``            -> ``PyPDFLoader``
    * ``.xls`` / ``.xlsx`` -> ``UnstructuredExcelLoader``
    * ``.json``           -> parsed with ``json`` and rendered to text here

    Args:
        filepath: Path of the file to load (``pathlib.Path`` or ``str``).

    Returns:
        A list of Document objects. The list is empty when the file type or
        the JSON structure is not supported, so callers can always
        ``extend`` with the result.

    Raises:
        Whatever the selected loader, ``open`` or ``json.load`` raises for an
        unreadable or malformed file; errors are not swallowed here.
    """
    filepath = Path(filepath)
    print(f"Loading {filepath}")
    suffix = filepath.suffix.lower()

    if suffix in ('.txt', '.md'):
        # Markdown is loaded as plain text.
        return TextLoader(str(filepath)).load()
    if suffix == '.csv':
        return CSVLoader(file_path=str(filepath)).load()
    if suffix == '.pdf':
        return PyPDFLoader(str(filepath)).load()
    if suffix in ('.xls', '.xlsx'):
        return UnstructuredExcelLoader(str(filepath)).load()
    if suffix == '.json':
        return _load_json_documents(filepath)

    print(f"Unsupported file type: {filepath}")
    return []


def _format_json_item(item):
    """Render a JSON value as text: ``key: value`` lines for a dict, ``str`` otherwise."""
    if isinstance(item, dict):
        return "\n".join(f"{k}: {v}" for k, v in item.items())
    return str(item)


def _load_json_documents(filepath):
    """Parse a JSON file and convert it into Document objects.

    A top-level list yields one Document per element (previously only a
    single element survived). A top-level dict yields one Document in which
    every key becomes a ``**key**`` heading followed by its value(s). Any
    other top-level value is reported and yields an empty list.
    """
    source = str(filepath)
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(filepath, encoding="utf-8") as f:
        json_data = json.load(f)

    if isinstance(json_data, list):
        return [
            Document(page_content=_format_json_item(item), metadata={'source': source})
            for item in json_data
        ]

    if isinstance(json_data, dict):
        parts = []
        for key, value in json_data.items():
            parts.append(f"**{key}**\n\n")
            if isinstance(value, list):
                parts.extend(_format_json_item(item) + "\n\n" for item in value)
            else:
                parts.append(str(value) + "\n\n")
        return [Document(page_content="".join(parts), metadata={'source': source})]

    print(f"Unsupported JSON structure in {filepath}")
    return []
def load_data_files(data_dir):
    """Load every data file found under a directory, recursing into subfolders.

    Only entries whose name contains a dot are considered (glob pattern
    ``**/*.*``). Directories matching the pattern are skipped, and each
    remaining file is passed to ``load_file``.

    Args:
        data_dir: The directory containing the data files.

    Returns:
        A list of Document objects, each representing a loaded document.
        Files are visited in sorted path order so the result is reproducible.
    """
    docs = []
    for filepath in sorted(Path(data_dir).glob('**/*.*')):
        # The pattern also matches directories such as "v1.0"; they are not data files.
        if not filepath.is_file():
            continue
        # Tolerate a loader that returns None for an unsupported file
        # instead of crashing in list.extend.
        docs.extend(load_file(filepath) or [])
    return docs
if __name__ == "__main__":
    # Manual smoke test: load everything under ./examples and print each
    # resulting document on its own line.
    for loaded_doc in load_data_files("examples"):
        print(loaded_doc)