import streamlit as st
import PyPDF2
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import tempfile
# Load local models once and cache them across Streamlit reruns
@st.cache_resource
def load_models():
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    qa_pipeline_model = pipeline("text2text-generation", model="google/flan-t5-base")
    return tokenizer, model, qa_pipeline_model

embedding_tokenizer, embedding_model, qa_pipeline_model = load_models()
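
# Note: the first run downloads both models from the Hugging Face Hub and
# caches them (by default under ~/.cache/huggingface); later runs work offline.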
# PDF loader: extract text from every page of the document
def load_pdf(file):
    reader = PyPDF2.PdfReader(file)
    text = ''
    for page in reader.pages:
        text += page.extract_text() or ''
    return text
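
# Note: pages without a text layer (e.g. scanned images) yield None or ''
# from extract_text(), so load_pdf() silently skips them.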
# Embed text: mean-pool MiniLM token embeddings into a single vector
def get_embedding(text):
    inputs = embedding_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        model_output = embedding_model(**inputs)
    return model_output.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
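
# Note: the pooling above averages over all token positions. With a single
# input string no padding tokens are added, so this is assumed to behave the
# same as attention-mask-weighted mean pooling for this app's usage.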
# Store vectors in-memory as (chunk, embedding) pairs
vector_store = []

def upload_document_chunks(chunks):
    vector_store.clear()
    for chunk in chunks:
        embedding = get_embedding(chunk)
        vector_store.append((chunk, embedding))
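
# Note: Streamlit re-executes this script on every interaction, so the store
# is rebuilt (and the PDF re-embedded) on each query; st.session_state could
# cache it across reruns if that becomes too slow.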
def query_answer(query):
    # Rank stored chunks by cosine similarity to the query embedding
    query_vec = get_embedding(query)
    similarities = [cosine_similarity([query_vec], [vec])[0][0] for _, vec in vector_store]
    # Return the three most similar chunks, best match first
    top_indices = np.argsort(similarities)[-3:][::-1]
    return [vector_store[i][0] for i in top_indices]
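
# An equivalent vectorized form of the per-chunk similarity loop (sketch):
#   cosine_similarity([query_vec], [vec for _, vec in vector_store])[0]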
def generate_response(context, query):
    # Prompt FLAN-T5 with the retrieved context and the user's question
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    response = qa_pipeline_model(prompt, max_new_tokens=100, do_sample=True)
    return response[0]['generated_text'].strip()
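
# Note: do_sample=True makes answers non-deterministic; pass do_sample=False
# to get reproducible greedy decoding instead.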
# Streamlit UI
st.set_page_config(page_title="Offline PDF QA Bot", layout="centered")
st.title("Offline PDF QA Bot")
st.markdown(
    "Upload a PDF document, ask a question, and get an answer using "
    "**only local models**, with no external APIs involved."
)

uploaded_file = st.file_uploader("Upload PDF", type="pdf")
user_query = st.text_input("Ask a question based on the document")
if uploaded_file and user_query:
    with st.spinner("Processing..."):
        # Persist the upload to a temporary file; reading it back after the
        # `with` block closes the handle ensures the bytes are flushed to disk
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(uploaded_file.read())
        document_text = load_pdf(tmp_file.name)

        # Split the text into fixed-size 500-character chunks and index them
        document_chunks = [document_text[i:i + 500] for i in range(0, len(document_text), 500)]
        upload_document_chunks(document_chunks)

        # Retrieve the most relevant chunks and generate an answer from them
        top_chunks = query_answer(user_query)
        context = " ".join(top_chunks)
        answer = generate_response(context, user_query)

    st.subheader("Retrieved Document Segments")
    for i, chunk in enumerate(top_chunks, 1):
        st.markdown(f"**Chunk {i}:** {chunk}")

    st.subheader("Answer")
    st.success(answer)
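
# To run the app locally (assuming this file is saved as app.py):
#   pip install streamlit PyPDF2 torch transformers scikit-learn numpy
#   streamlit run app.py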