# Arxiv Metadata Dataset - Loader and Retriever

- Load arXiv metadata from a Hugging Face dataset and load it into Qdrant
- Use LangGraph to store trace info

In [None]:
# Install (quietly, upgrading to the latest release) the packages this notebook
# imports: PyMuPDF, the LangChain packages, and the OpenAI, Groq and Qdrant
# integrations.
%pip install -qU pymupdf 
%pip install -qU langchain langchain-core langchain-community langchain-text-splitters 
%pip install -qU langchain-openai
%pip install -qU langchain-groq
%pip install -qU langchain-qdrant

In [None]:
# Notebook parameters.
# QUESTION is the query string sent to the RAG chain when it is invoked.

QUESTION = "What are the emerging patterns for building Systems of Agents that could provide the system the ability to evolve and improve its own processes through learning?"

In [None]:
import os
from langchain import hub
from langchain_groq import ChatGroq
from config import COLLECTION_NAME, DATASET_NAME, OPENAI_API_KEY, QDRANT_API_KEY, QDRANT_API_URL, LANGCHAIN_HUB_PROMPT
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_qdrant import Qdrant
# TODO: identify a data loader for HTML documents

In [None]:
from langchain_openai import OpenAIEmbeddings

# Embedding model used to vectorise both the document chunks and the queries.
# The key imported from config is passed explicitly so the notebook does not
# silently depend on an OPENAI_API_KEY environment variable; an empty/None
# value is normalised to None so the client's own fallback still applies.
embedding = OpenAIEmbeddings(
    model="text-embedding-3-small",
    api_key=OPENAI_API_KEY or None,
)

# Prompt pulled from the LangChain Hub; the identifier comes from config.
prompt = hub.pull(LANGCHAIN_HUB_PROMPT)

In [None]:
# Load one PDF and split it into overlapping chunks ready for embedding.
#
# TODO: iterate over the records retrieved from the Hugging Face dataset and
# take the URL path from each record. HTML documents will need a different
# loader than PyMuPDFLoader.
URL_PATH = None  # placeholder until the dataset lookup is implemented

# Fail with a clear message instead of handing an empty path to the loader.
if not URL_PATH:
    raise ValueError(
        "URL_PATH is not set: assign the PDF URL or file path retrieved from "
        "the dataset before running this cell."
    )

# NOTE(review): extract_images=True may require an OCR backend that the
# install cell does not list (e.g. rapidocr-onnxruntime) -- confirm.
loader = PyMuPDFLoader(URL_PATH, extract_images=True)
docs = loader.load()

# 1000-character chunks with a 200-character overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [None]:
# Embed the chunks and write them to the configured Qdrant collection.
# The resulting vector-store handle is kept in `from_splits`.
from_splits = Qdrant.from_documents(
    documents=splits,
    embedding=embedding,
    url=QDRANT_API_URL,
    api_key=QDRANT_API_KEY,
    prefer_grpc=True,
    collection_name=COLLECTION_NAME,
)

## Retrieve Information using Metadata in Vector Store

In [None]:
# Attach to the existing Qdrant collection without inserting anything.
# The embedding model is passed positionally so the call binds correctly
# whether the first parameter is named `embeddings` (as in the legacy `Qdrant`
# class) or `embedding`; passing it as `embedding=` would leave the required
# `embeddings` argument unset.
qdrant = Qdrant.from_existing_collection(
    embedding,
    collection_name=COLLECTION_NAME,
    url=QDRANT_API_URL,
    api_key=QDRANT_API_KEY,
    prefer_grpc=True,
)

# Retriever that returns up to 5 chunks, keeping only those whose similarity
# score meets the 0.5 threshold.
retriever = qdrant.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.5, "k": 5},
)

In [None]:
from operator import itemgetter

from langchain_core.runnables import RunnablePassthrough
from langchain_groq import ChatGroq

# RunnablePassthrough is imported from langchain_core (installed above), its
# canonical home, rather than the legacy `langchain.schema.runnable` re-export.

# Chat model that generates the answer.
llm = ChatGroq(model="llama3-70b-8192", temperature=0.3)

# RAG pipeline, given {"question": ...}:
#   1. retrieve context documents for the question and keep the question;
#   2. re-assign "context" to itself (leaves the dict unchanged);
#   3. run prompt | llm for the answer and pass the context through, yielding
#      {"response": <LLM message>, "context": <retrieved documents>}.
rag_chain = (
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | llm, "context": itemgetter("context")}
)

In [None]:
# Print an ASCII rendering of the chain's runnable graph.
chain_graph = rag_chain.get_graph()
print(chain_graph.draw_ascii())

In [None]:
# Run the RAG chain on the configured question.
chain_input = {"question": QUESTION}
response = rag_chain.invoke(chain_input)

In [None]:
# Show only the answer text: the "response" entry holds the LLM message and
# its `content` attribute is the generated text.
answer_message = response["response"]
print(answer_message.content)


In [None]:
# Display the retrieved documents that the chain passed through as "context".
response["context"]