# formpilot-demo/rag/qa_chain.py
# afulara's picture
# Auto-deploy from GitHub
# 5ee4946 verified
"""
RAG answer chain for FormPilot.
Call `get_answer(question: str)` to obtain an `(answer, sources)` tuple: the
answer text and the set of source identifiers of the retrieved documents.
"""
from pathlib import Path
from qdrant_client import QdrantClient
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Qdrant
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain import hub
# ---------- static vector store ----------
# Everything in this section is created once, when the module is imported,
# and is then shared by every call into this module.

# Embedding model handed to the vector store below.
_EMB = OpenAIEmbeddings(model="text-embedding-3-small")
# Qdrant client opened on the relative path "qdrant_data".
# NOTE(review): presumably resolved against the process working directory, so
# the app has to be started from the directory that contains it -- confirm.
_QCLIENT = QdrantClient(path="qdrant_data")
# LangChain vector-store wrapper over the "formpilot_docs" collection; the QA
# chain further down obtains its retriever from this object.
_VSTORE = Qdrant(
client=_QCLIENT,
collection_name="formpilot_docs",
embeddings=_EMB,
# NOTE(review): presumably the payload field that stores each chunk's text;
# verify against whatever script populated the collection.
content_payload_key="text",
)
# ---------- prompt ----------
# Behavioural rules for the model: answer only from the retrieved context,
# fall back to "I don't know.", and always cite sources.
_SYSTEM = """You are FormPilot, an AI paralegal assistant.
Answer the user's question ONLY with information grounded in the context.
If the answer is not in the context, say "I don't know.".
Always cite sources like (I‑485instr.pdf:page‑X)."""

# The rules have to be part of the template itself, otherwise the model never
# sees them. `_SYSTEM` contains no curly braces, so prepending it introduces
# no template variables beyond {context} and {question}.
prompt = PromptTemplate.from_template(
    _SYSTEM + "\n\n{context}\n\nQuestion: {question}\nAnswer: (with citations):"
)
# ---------- QA chain ----------
# Chat model that writes the final answer (temperature 0).
_LLM = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
# Retriever over the vector store, configured with k=4.
_RETRIEVER = _VSTORE.as_retriever(search_kwargs={"k": 4})
# "stuff" chain using the custom prompt; the retrieved documents are returned
# together with the answer so that callers can build citations from them.
_CHAIN = RetrievalQA.from_chain_type(
    llm=_LLM,
    chain_type="stuff",
    retriever=_RETRIEVER,
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)
def get_answer(question: str) -> "tuple[str, set[str]]":
    """Answer *question* with the RAG chain and report the sources consulted.

    Args:
        question: The user's question as plain text.

    Returns:
        A pair ``(answer, citations)``. ``answer`` is the chain's reply and
        ``citations`` is the set of non-empty ``source`` metadata values of
        the retrieved documents. ``("I don't know.", set())`` is returned when
        the question is blank or when retrieval yields no non-blank document.
    """
    # A blank question has nothing to embed or retrieve: skip the remote
    # calls entirely and reply with the module's standard fallback.
    if not question or not question.strip():
        return "I don't know.", set()

    # `invoke` is the supported entry point (calling the chain object directly
    # is deprecated in LangChain); RetrievalQA reads its input from "query".
    result = _CHAIN.invoke({"query": question})
    docs = result.get("source_documents") or []

    # No context at all, or only blank pages: the answer cannot be grounded.
    if not any((doc.page_content or "").strip() for doc in docs):
        return "I don't know.", set()

    # Cite only documents that actually carry a non-empty source.
    citations = {
        doc.metadata["source"] for doc in docs if doc.metadata.get("source")
    }
    return result["result"], citations