gradio / local_loader.py
aleksandrrnt's picture
Upload 50 files
ee8fb16 verified
import os
import json
from pathlib import Path
from pypdf import PdfReader
from langchain.docstore.document import Document
from langchain_community.document_loaders import TextLoader, CSVLoader, PyPDFLoader
from langchain_community.document_loaders.excel import UnstructuredExcelLoader
def _json_item_to_text(item):
    """Render one element of a JSON list as text.

    A dict becomes one ``key: value`` line per entry; anything else is
    converted with ``str()``.
    """
    if isinstance(item, dict):
        return "\n".join(f"{k}: {v}" for k, v in item.items())
    return str(item)


def _json_to_documents(json_data, source):
    """Convert parsed JSON into a list of Documents.

    Args:
        json_data: The object returned by ``json.load``.
        source: String stored under the ``'source'`` metadata key.

    Returns:
        One Document per element for a top-level list, a single Document
        for a top-level dict, or an empty list for any other structure.
    """
    if isinstance(json_data, list):
        # One Document per item, so no entry of the list is dropped.
        return [
            Document(
                page_content=_json_item_to_text(item),
                metadata={'source': source},
            )
            for item in json_data
        ]

    if isinstance(json_data, dict):
        parts = []
        for key, value in json_data.items():
            # Each top-level key becomes a bold heading followed by its value(s).
            parts.append(f"**{key}**\n\n")
            if isinstance(value, list):
                parts.extend(_json_item_to_text(item) + "\n\n" for item in value)
            else:
                parts.append(str(value) + "\n\n")
        return [Document(page_content="".join(parts), metadata={'source': source})]

    print(f"Unsupported JSON structure in {source}")
    return []


def load_file(filepath):
    """Load a single file into a list of Document objects.

    The loader is chosen from the file suffix (compared case-insensitively):
    ``.txt``/``.md`` use TextLoader, ``.csv`` uses CSVLoader, ``.pdf`` uses
    PyPDFLoader, ``.xls``/``.xlsx`` use UnstructuredExcelLoader, and ``.json``
    is parsed with the ``json`` module and converted to Documents here.

    Args:
        filepath: Path to the file, as a ``pathlib.Path`` or a string.

    Returns:
        A list of Document objects. The list is empty when the file type or
        the JSON structure is not supported, so callers can always
        ``extend`` with the result.

    Raises:
        Whatever the underlying loader, ``open`` or ``json.load`` raises;
        errors are not swallowed here.
    """
    filepath = Path(filepath)
    print(f"Loading {filepath}")
    suffix = filepath.suffix.lower()

    if suffix in ('.txt', '.md'):
        # Markdown is loaded as plain text.
        return TextLoader(str(filepath)).load()
    if suffix == '.csv':
        return CSVLoader(file_path=str(filepath)).load()
    if suffix == '.pdf':
        return PyPDFLoader(str(filepath)).load()
    if suffix in ('.xls', '.xlsx'):
        return UnstructuredExcelLoader(str(filepath)).load()
    if suffix == '.json':
        # Explicit UTF-8 so decoding does not depend on the platform default.
        with open(filepath, encoding="utf-8") as f:
            json_data = json.load(f)
        return _json_to_documents(json_data, str(filepath))

    print(f"Unsupported file type: {filepath}")
    return []
def load_data_files(data_dir):
    """
    Loads all data files from the specified directory, handling various file types.

    The directory is searched recursively for entries whose name contains a
    dot. Directories that happen to match the pattern are skipped, and files
    for which ``load_file`` yields nothing (``None`` or an empty list) add
    no documents instead of aborting the whole run.

    Args:
        data_dir: The directory containing the data files.
    Returns:
        A list of Document objects, each representing a loaded document.
    """
    docs = []
    for filepath in Path(data_dir).glob('**/*.*'):
        # The glob also matches directories such as "v1.0"; only files are loadable.
        if not filepath.is_file():
            continue
        loaded = load_file(filepath)
        # Guard against a None/empty result so extend() never raises TypeError.
        if loaded:
            docs.extend(loaded)
    return docs
if __name__ == "__main__":
    # Manual smoke test: load everything under 'examples' and print each document.
    for loaded_doc in load_data_files("examples"):
        print(loaded_doc)