import logging
import os
from typing import Iterator, List, Union
from urllib.parse import quote

import requests
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document

from open_webui.env import SRC_LOG_LEVELS

log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])


class ExternalDocumentLoader(BaseLoader):
    """Load documents by delegating extraction to an external HTTP service.

    The raw bytes of ``file_path`` are sent with ``PUT {url}/process``; the
    JSON response (a single object or a list of objects carrying
    ``page_content`` and ``metadata`` keys) is converted into ``Document``
    instances.
    """

    def __init__(
        self,
        file_path,
        url: str,
        api_key: str,
        mime_type=None,
        **kwargs,
    ) -> None:
        """Store the connection settings.

        Args:
            file_path: Path of the file whose bytes are uploaded.
            url: Base URL of the service; a single trailing ``/`` is ignored.
            api_key: Bearer token; no ``Authorization`` header is sent when
                it is ``None``.
            mime_type: Value for the ``Content-Type`` header; omitted when
                ``None``.
            **kwargs: Accepted for call-site compatibility and ignored.
        """
        self.url = url
        self.api_key = api_key
        self.file_path = file_path
        self.mime_type = mime_type

    def _build_headers(self) -> dict:
        """Return the request headers derived from the loader settings."""
        headers = {}
        if self.mime_type is not None:
            headers["Content-Type"] = self.mime_type
        if self.api_key is not None:
            headers["Authorization"] = f"Bearer {self.api_key}"

        # The filename header is best-effort: skip it when file_path is not
        # path-like (e.g. an integer file descriptor) instead of failing.
        try:
            filename = os.path.basename(self.file_path)
        except TypeError:
            return headers

        if isinstance(filename, str):
            # str header values are encoded as latin-1 on the wire; a name
            # outside that range would make the request itself fail, so only
            # those names are percent-encoded. Names that already worked are
            # sent unchanged.
            try:
                filename.encode("latin-1")
            except UnicodeEncodeError:
                filename = quote(filename)

        headers["X-Filename"] = filename
        return headers

    @staticmethod
    def _to_document(item) -> Document:
        """Convert one decoded JSON object from the service into a Document."""
        if not isinstance(item, dict):
            raise Exception("Error loading document: Unable to parse content")
        return Document(
            page_content=item.get("page_content"),
            # A missing/null metadata entry becomes an empty dict rather
            # than None being handed to Document.
            metadata=item.get("metadata") or {},
        )

    def load(self) -> List[Document]:
        """Upload the file and return the documents produced by the service.

        Returns:
            One ``Document`` for an object response, or one per element for
            a list response.

        Raises:
            Exception: If the request cannot be sent, the service answers
                with a non-success status, the body is empty or not valid
                JSON, or the decoded payload is neither an object nor a list
                of objects.
            OSError: If ``file_path`` cannot be opened or read.
        """
        with open(self.file_path, "rb") as f:
            data = f.read()

        headers = self._build_headers()

        url = self.url
        if url.endswith("/"):
            url = url[:-1]

        try:
            response = requests.put(f"{url}/process", data=data, headers=headers)
        except Exception as e:
            log.error(f"Error connecting to endpoint: {e}")
            # Chain the cause so the original traceback is preserved.
            raise Exception(f"Error connecting to endpoint: {e}") from e

        if not response.ok:
            raise Exception(
                f"Error loading document: {response.status_code} {response.text}"
            )

        try:
            response_data = response.json()
        except ValueError as e:
            # A success status with a non-JSON body is a parse failure.
            raise Exception("Error loading document: Unable to parse content") from e

        if not response_data:
            raise Exception("Error loading document: No content returned")
        if isinstance(response_data, dict):
            return [self._to_document(response_data)]
        if isinstance(response_data, list):
            return [self._to_document(item) for item in response_data]
        raise Exception("Error loading document: Unable to parse content")