import streamlit as st
import PyPDF2
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Load local models once and cache them across Streamlit reruns
@st.cache_resource
def load_models():
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    qa_pipeline_model = pipeline("text2text-generation", model="google/flan-t5-base")
    return tokenizer, model, qa_pipeline_model


embedding_tokenizer, embedding_model, qa_pipeline_model = load_models()


# PDF loader: concatenate the extracted text of every page
def load_pdf(file):
    reader = PyPDF2.PdfReader(file)
    text = ''
    for page in reader.pages:
        text += page.extract_text() or ''
    return text


# Embed text by mean-pooling the model's token embeddings
def get_embedding(text):
    inputs = embedding_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        model_output = embedding_model(**inputs)
    return model_output.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()


# In-memory vector store: a list of (chunk, embedding) pairs
vector_store = []


def upload_document_chunks(chunks):
    vector_store.clear()
    for chunk in chunks:
        vector_store.append((chunk, get_embedding(chunk)))


def query_answer(query):
    # Rank all stored chunks by cosine similarity to the query embedding
    if not vector_store:
        return []
    query_vec = get_embedding(query)
    chunk_vecs = np.array([vec for _, vec in vector_store])
    similarities = cosine_similarity([query_vec], chunk_vecs)[0]
    top_indices = np.argsort(similarities)[-3:][::-1]  # three most similar chunks
    return [vector_store[i][0] for i in top_indices]


def generate_response(context, query):
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    response = qa_pipeline_model(prompt, max_new_tokens=100, do_sample=True)
    return response[0]['generated_text'].strip()


# Streamlit UI
st.set_page_config(page_title="Offline PDF QA Bot", layout="centered")
st.title("📄 Offline PDF QA Bot 🔍")
st.markdown(
    "Upload a PDF document, ask a question, and get an answer using "
    "**only local models**, with no external APIs involved."
)

uploaded_file = st.file_uploader("📁 Upload PDF", type="pdf")
user_query = st.text_input("❓ Ask a question based on the document")

if uploaded_file and user_query:
    with st.spinner("Processing..."):
        # Streamlit's UploadedFile is a file-like object, so PyPDF2 can read it directly
        document_text = load_pdf(uploaded_file)

        # Split the document into fixed-size 500-character chunks and embed each one
        document_chunks = [document_text[i:i + 500] for i in range(0, len(document_text), 500)]
        upload_document_chunks(document_chunks)

        # Retrieve the most relevant chunks and generate an answer from them
        top_chunks = query_answer(user_query)
        context = " ".join(top_chunks)
        answer = generate_response(context, user_query)

    st.subheader("📜 Retrieved Document Segments")
    for i, chunk in enumerate(top_chunks, 1):
        st.markdown(f"**Chunk {i}:** {chunk}")

    st.subheader("💬 Answer")
    st.success(answer)
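
# --- Usage note ---
# Assumption: this script is saved as app.py; it can then be launched with
#   streamlit run app.py
# The first run downloads all-MiniLM-L6-v2 and flan-t5-base into the local
# Hugging Face cache; subsequent runs work fully offline.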