|
import streamlit as st |
|
import PyPDF2 |
|
import torch |
|
from transformers import AutoTokenizer, AutoModel, pipeline |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
import numpy as np |
|
import tempfile |
|
|
|
|
|
@st.cache_resource
def load_models():
    """Load and cache the local models used by the app.

    Returns:
        tuple: (tokenizer, model, qa_pipeline) — the MiniLM sentence-embedding
        tokenizer and model, plus a FLAN-T5 text2text-generation pipeline.
        Cached by Streamlit so reruns reuse the same objects.
    """
    embed_name = 'sentence-transformers/all-MiniLM-L6-v2'
    embed_tokenizer = AutoTokenizer.from_pretrained(embed_name)
    embed_model = AutoModel.from_pretrained(embed_name)
    generator = pipeline("text2text-generation", model="google/flan-t5-base")
    return embed_tokenizer, embed_model, generator
|
|
|
embedding_tokenizer, embedding_model, qa_pipeline_model = load_models() |
|
|
|
|
|
def load_pdf(file):
    """Extract the plain text of every page in a PDF.

    Args:
        file: A path or binary file-like object accepted by PyPDF2.PdfReader.

    Returns:
        str: Concatenated page text; pages with no extractable text
        contribute the empty string.
    """
    pages = PyPDF2.PdfReader(file).pages
    return ''.join(page.extract_text() or '' for page in pages)
|
|
|
|
|
def get_embedding(text):
    """Embed *text* as a mean-pooled MiniLM sentence vector.

    Args:
        text (str): Text to embed; truncated to the model's max length.

    Returns:
        numpy.ndarray: 1-D embedding (token hidden states averaged over
        the sequence dimension), moved to CPU.
    """
    encoded = embedding_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        hidden = embedding_model(**encoded).last_hidden_state
    return hidden.mean(dim=1).squeeze().cpu().numpy()
|
|
|
|
|
vector_store = [] |
|
|
|
def upload_document_chunks(chunks):
    """Rebuild the global vector store from *chunks*.

    Clears any previously stored document, then stores one
    (chunk_text, embedding) pair per chunk, preserving order.
    """
    vector_store.clear()
    vector_store.extend((chunk, get_embedding(chunk)) for chunk in chunks)
|
|
|
def query_answer(query):
    """Return up to the 3 stored chunks most similar to *query*.

    Improvements over the original: similarities are computed with a single
    vectorized cosine_similarity call instead of one sklearn call per stored
    chunk, and an empty vector store short-circuits to [] rather than relying
    on argsort over an empty list.

    Args:
        query (str): User question to embed and match against the store.

    Returns:
        list[str]: Best-matching chunk texts, most similar first (may be
        fewer than 3 if fewer chunks are stored).
    """
    if not vector_store:
        return []
    query_vec = get_embedding(query)
    # One (num_chunks, dim) matrix -> one sklearn call for all similarities.
    chunk_matrix = np.stack([vec for _, vec in vector_store])
    similarities = cosine_similarity(query_vec.reshape(1, -1), chunk_matrix)[0]
    top_indices = np.argsort(similarities)[-3:][::-1]
    return [vector_store[i][0] for i in top_indices]
|
|
|
def generate_response(context, query):
    """Generate an answer to *query* grounded in *context* with FLAN-T5.

    Args:
        context (str): Retrieved document text to condition the answer on.
        query (str): The user's question.

    Returns:
        str: The generated answer, stripped of surrounding whitespace.
        Sampling is enabled, so output is not deterministic.
    """
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    generated = qa_pipeline_model(prompt, max_new_tokens=100, do_sample=True)
    answer_text = generated[0]['generated_text']
    return answer_text.strip()
|
|
|
|
|
# --- Streamlit UI ------------------------------------------------------------
st.set_page_config(page_title="Offline PDF QA Bot", layout="centered")
st.title("π Offline PDF QA Bot π")
st.markdown(
    "Upload a PDF document, ask a question, and get an answer using **only local models** β no external APIs involved."
)

uploaded_file = st.file_uploader("π Upload PDF", type="pdf")
user_query = st.text_input("β Ask a question based on the document")

if uploaded_file and user_query:
    with st.spinner("Processing..."):
        # Fix: PyPDF2.PdfReader accepts a binary file-like object, so the
        # upload is read directly. The previous NamedTemporaryFile(delete=False)
        # approach both leaked the temp file (never deleted) and read it back
        # before the write was flushed/closed, risking a truncated PDF.
        document_text = load_pdf(uploaded_file)

        # Fixed-size 500-character chunks (no overlap) for embedding/retrieval.
        document_chunks = [document_text[i:i + 500] for i in range(0, len(document_text), 500)]

        upload_document_chunks(document_chunks)
        top_chunks = query_answer(user_query)
        context = " ".join(top_chunks)

        answer = generate_response(context, user_query)

    st.subheader("π Retrieved Document Segments")
    for i, chunk in enumerate(top_chunks, 1):
        st.markdown(f"**Chunk {i}:** {chunk}")

    st.subheader("π¬ Answer")
    st.success(answer)
|
|