import os
import logging

from langchain_community.retrievers import BM25Retriever, TavilySearchAPIRetriever
from langchain.retrievers import EnsembleRetriever
from langchain_core.output_parsers import StrOutputParser
from dotenv import load_dotenv

from basic_chain import get_model
from rag_chain import make_rag_chain
from remote_loader import load_web_page
from splitter import split_documents
from vector_store import create_vector_db


def ensemble_retriever_from_docs(docs, embeddings=None):
    # Split the documents, index them in a vector store, and build a keyword (BM25) index over the same chunks.
    texts = split_documents(docs)
    vs = create_vector_db(texts, embeddings)
    vs_retriever = vs.as_retriever()
    bm25_retriever = BM25Retriever.from_texts([t.page_content for t in texts])

    # tavily_retriever = TavilySearchAPIRetriever(k=3, include_domains=['https://ilibrary.ru/text/107'])
    tavily_retriever = MyTavilySearchAPIRetriever(k=3, include_domains=['https://equitygroupholdings.com'])

    # Combine keyword, vector, and web-search results with equal weights.
    ensemble_retriever = EnsembleRetriever(
        retrievers=[bm25_retriever, vs_retriever, tavily_retriever],
        weights=[0.5, 0.5, 0.5])
    return ensemble_retriever


class MyTavilySearchAPIRetriever(TavilySearchAPIRetriever):
    """Tavily retriever that logs API errors and returns no documents instead of raising."""

    def _get_relevant_documents(self, query: str, *, run_manager):
        try:
            return super()._get_relevant_documents(query, run_manager=run_manager)
        except Exception as e:
            logging.error(f"TavilySearch error: {e}")
            return []


def main():
    load_dotenv()
    problems_of_philosophy_by_russell = "https://www.gutenberg.org/ebooks/5827.html.images"
    docs = load_web_page(problems_of_philosophy_by_russell)
    ensemble_retriever = ensemble_retriever_from_docs(docs)
    model = get_model("ChatGPT")
    chain = make_rag_chain(model, ensemble_retriever) | StrOutputParser()
    result = chain.invoke("What are the key problems of philosophy according to Russell?")
    print(result)


if __name__ == "__main__":
    # This quiets the parallel tokenizers warning.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    main()