# gradio / ensemble.py
# aleksandrrnt's picture
# Upload 10 files
# 820f884 verified
import os
import logging
from langchain_community.retrievers import BM25Retriever, TavilySearchAPIRetriever
from langchain.retrievers import EnsembleRetriever
from langchain_core.output_parsers import StrOutputParser
from basic_chain import get_model
from rag_chain import make_rag_chain
from remote_loader import load_web_page
from splitter import split_documents
from vector_store import create_vector_db
from dotenv import load_dotenv
def ensemble_retriever_from_docs(docs, embeddings=None):
    """Build a hybrid retriever over ``docs`` plus a Tavily web search.

    The documents are split into chunks that feed both a vector-store
    retriever and a BM25 keyword retriever. A Tavily search retriever is
    added as a third source, and the three are combined in an
    ``EnsembleRetriever`` with equal weights.

    Args:
        docs: Documents to index; passed to ``split_documents``.
        embeddings: Optional embeddings object, forwarded unchanged to
            ``create_vector_db``.

    Returns:
        An ``EnsembleRetriever`` over the BM25, vector-store and Tavily
        retrievers (in that order).
    """
    chunks = split_documents(docs)
    vs_retriever = create_vector_db(chunks, embeddings).as_retriever()

    # Index the chunk Documents themselves rather than only their text, so
    # BM25 hits keep their metadata. BM25 is listed first in the ensemble,
    # so a metadata-less copy here could otherwise shadow the vector-store
    # copy of the same chunk when results are merged.
    bm25_retriever = BM25Retriever.from_documents(chunks)

    # NOTE(review): Tavily's include_domains is documented as a list of
    # domains; confirm that a full URL with scheme is accepted here.
    tavily_retriever = MyTavilySearchAPIRetriever(
        k=3,
        include_domains=['https://equitygroupholdings.com'],
    )

    return EnsembleRetriever(
        retrievers=[bm25_retriever, vs_retriever, tavily_retriever],
        weights=[0.5, 0.5, 0.5],
    )
class MyTavilySearchAPIRetriever(TavilySearchAPIRetriever):
    """Tavily retriever that returns no results when the search fails.

    Any exception raised by the underlying Tavily lookup is logged and
    replaced by an empty result list, so a caller combining several
    retrievers can still use the results of the others. Both the
    synchronous and the asynchronous retrieval hooks are guarded.
    """

    def _get_relevant_documents(
        self, query: str, *, run_manager
    ):
        """Return Tavily results for ``query``, or ``[]`` if the call fails."""
        try:
            return super()._get_relevant_documents(query, run_manager=run_manager)
        except Exception as e:
            # Deliberate best-effort: web search is optional, so log the
            # failure (with traceback) and carry on without its results.
            logging.exception("TavilySearch error: %s", e)
            return []

    async def _aget_relevant_documents(
        self, query: str, *, run_manager
    ):
        """Async counterpart of ``_get_relevant_documents`` with the same fallback."""
        try:
            return await super()._aget_relevant_documents(
                query, run_manager=run_manager
            )
        except Exception as e:
            # Same best-effort policy as the synchronous path.
            logging.exception("TavilySearch error: %s", e)
            return []
def main():
    """Run a demo query against a RAG chain built on the ensemble retriever.

    Calls ``load_dotenv()``, loads a Project Gutenberg page with
    ``load_web_page``, builds the ensemble retriever from it, wires the
    retriever into a RAG chain with the "ChatGPT" model, and prints the
    string answer to one fixed question.
    """
    load_dotenv()

    source_url = "https://www.gutenberg.org/ebooks/5827.html.images"
    retriever = ensemble_retriever_from_docs(load_web_page(source_url))

    # Pipe the chain output through a string parser so invoke() yields text.
    rag_pipeline = make_rag_chain(get_model("ChatGPT"), retriever) | StrOutputParser()

    question = "What are the key problems of philosophy according to Russell?"
    print(rag_pipeline.invoke(question))
if __name__ == "__main__":
    # Quiet the parallel-tokenizers warning; set before main() so the
    # variable is in place before any model or tokenizer code runs.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    main()