# PDFQA / src/streamlit_app.py
# Provenance: gaur3009 — "Update src/streamlit_app.py" (commit 1b89b73, verified)
import streamlit as st
import PyPDF2
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import tempfile
# Load and cache the heavyweight models a single time per Streamlit process.
@st.cache_resource
def load_models():
    """Instantiate the local models used by the app.

    Returns:
        tuple: (embedding tokenizer, embedding model, text2text-generation
        pipeline) — MiniLM for retrieval, Flan-T5 for answer generation.
    """
    embed_name = 'sentence-transformers/all-MiniLM-L6-v2'
    tok = AutoTokenizer.from_pretrained(embed_name)
    enc = AutoModel.from_pretrained(embed_name)
    gen = pipeline("text2text-generation", model="google/flan-t5-base")
    return tok, enc, gen

embedding_tokenizer, embedding_model, qa_pipeline_model = load_models()
# PDF loader
def load_pdf(file):
    """Extract and concatenate the text of every page in *file*.

    Args:
        file: path or binary file-like object accepted by PyPDF2.PdfReader.

    Returns:
        str: full document text; pages with no extractable text contribute ''.
    """
    pages = PyPDF2.PdfReader(file).pages
    return ''.join(page.extract_text() or '' for page in pages)
# Embed text
def get_embedding(text):
    """Return a sentence embedding for *text* as a 1-D numpy float array.

    Pools the encoder's last hidden state with an attention-mask-weighted
    mean — the pooling documented for sentence-transformers models when used
    through raw `transformers`. For a single unpadded sequence the mask is all
    ones, so this equals the plain token mean the previous version computed;
    for padded batch input it correctly ignores padding tokens.
    """
    inputs = embedding_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        model_output = embedding_model(**inputs)
    hidden = model_output.last_hidden_state                 # (batch, seq, dim)
    mask = inputs["attention_mask"].unsqueeze(-1).type_as(hidden)
    # clamp guards against a zero-length mask producing a 0/0 division
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return pooled.squeeze().cpu().numpy()
# In-memory vector store: list of (chunk_text, embedding) pairs.
vector_store = []

def upload_document_chunks(chunks):
    """Re-index the store: drop previous entries, embed and add each chunk."""
    vector_store.clear()
    # generator keeps the original one-chunk-at-a-time embed/append order
    vector_store.extend((chunk, get_embedding(chunk)) for chunk in chunks)
def query_answer(query):
    """Return up to the 3 stored chunks most cosine-similar to *query*.

    Embeds the query once and scores all stored embeddings in a single
    vectorized numpy pass (the previous version made one sklearn
    `cosine_similarity` call per stored chunk). Chunks come back best-first.

    Returns:
        list[str]: top-ranked chunk texts; [] when nothing is indexed yet.
    """
    if not vector_store:
        return []
    query_vec = get_embedding(query)
    matrix = np.stack([vec for _, vec in vector_store])      # (n, dim)
    # cosine similarity = dot product / product of norms; clip avoids 0-division
    denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vec)
    similarities = matrix @ query_vec / np.clip(denom, 1e-12, None)
    top_indices = np.argsort(similarities)[-3:][::-1]
    return [vector_store[i][0] for i in top_indices]
def generate_response(context, query):
    """Ask the local Flan-T5 pipeline to answer *query* grounded in *context*."""
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    outputs = qa_pipeline_model(prompt, max_new_tokens=100, do_sample=True)
    answer_text = outputs[0]['generated_text']
    return answer_text.strip()
# Streamlit UI
st.set_page_config(page_title="Offline PDF QA Bot", layout="centered")
st.title("📄 Offline PDF QA Bot 🔍")
st.markdown(
    "Upload a PDF document, ask a question, and get an answer using **only local models** — no external APIs involved."
)

uploaded_file = st.file_uploader("📁 Upload PDF", type="pdf")
user_query = st.text_input("❓ Ask a question based on the document")

if uploaded_file and user_query:
    with st.spinner("Processing..."):
        # PyPDF2.PdfReader accepts a binary file-like object, so the uploaded
        # buffer is parsed directly. The previous NamedTemporaryFile(delete=False)
        # was never removed, leaking one temp file per query.
        document_text = load_pdf(uploaded_file)
        if not document_text.strip():
            # Scanned/image-only PDFs yield no extractable text; fail clearly
            # instead of indexing nothing and generating an ungrounded answer.
            st.warning("No extractable text found in this PDF.")
        else:
            # Fixed-size 500-character chunks keep each embedding input short.
            document_chunks = [document_text[i:i + 500]
                               for i in range(0, len(document_text), 500)]
            upload_document_chunks(document_chunks)
            top_chunks = query_answer(user_query)
            context = " ".join(top_chunks)
            answer = generate_response(context, user_query)

            st.subheader("📜 Retrieved Document Segments")
            for i, chunk in enumerate(top_chunks, 1):
                st.markdown(f"**Chunk {i}:** {chunk}")

            st.subheader("💬 Answer")
            st.success(answer)