Spaces:

aleksandrrnt
/

gradio

Sleeping

App Files Files Community

gradio / local_loader.py

aleksandrrnt

Upload 50 files

ee8fb16 verified 12 months ago

raw

history blame contribute delete

2.84 kB

	import os
	import json
	from pathlib import Path

	from pypdf import PdfReader
	from langchain.docstore.document import Document
	from langchain_community.document_loaders import TextLoader, CSVLoader, PyPDFLoader
	from langchain_community.document_loaders.excel import UnstructuredExcelLoader

	def load_file(filepath):
	# try:
	print(f"Loading {filepath}")
	if filepath.suffix == '.txt':
	loader = TextLoader(str(filepath))
	return loader.load()
	elif filepath.suffix == '.csv':
	loader = CSVLoader(file_path=str(filepath))
	return loader.load()
	elif filepath.suffix == '.pdf':
	loader = PyPDFLoader(str(filepath))
	return loader.load()
	elif filepath.suffix == '.md':
	# Load Markdown file as a Document using TextLoader
	loader = TextLoader(str(filepath))
	return loader.load()
	elif filepath.suffix == '.xls' or filepath.suffix == '.xlsx':
	loader = UnstructuredExcelLoader(str(filepath))
	return loader.load()
	elif filepath.suffix == '.json':
	with open(filepath) as f:
	json_data = json.load(f)
	if isinstance(json_data, list): # Handle list of dictionaries
	for item in json_data:
	content = "\n".join([f"{k}: {v}" for k, v in item.items()])
	return [Document(page_content=content, metadata={'source': str(filepath)})]
	elif isinstance(json_data, dict): # Handle nested dictionaries
	content = ""
	for key, value in json_data.items():
	content += f"{key}\n\n"
	if isinstance(value, list):
	for item in value:
	if isinstance(item, dict):
	content += "\n".join([f"{k}: {v}" for k, v in item.items()]) + "\n\n"
	else:
	content += str(item) + "\n\n"
	else:
	content += str(value) + "\n\n"
	return [Document(page_content=content, metadata={'source': str(filepath)})]
	else:
	print(f"Unsupported JSON structure in {filepath}")
	else:
	print(f"Unsupported file type: {filepath}")
	# except Exception as e:
	# print(f"Error loading {filepath}: {e}")

	def load_data_files(data_dir):
	"""
	Loads all data files from the specified directory, handling various file types.

	Args:
	data_dir: The directory containing the data files.

	Returns:
	A list of Document objects, each representing a loaded document.
	"""
	docs = []
	for filepath in Path(data_dir).glob('*/.*'):
	docs.extend(load_file(filepath))
	return docs


	if __name__ == "__main__":
	# Test with files in the 'examples' directory
	docs = load_data_files("examples")
	for doc in docs:
	print(doc)