import streamlit as st
import PyPDF2
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import tempfile

# Load local models once
@st.cache_resource
def load_models():
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
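    # flan-t5-base serves as the local text generator used to produce answers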
    qa_pipeline_model = pipeline("text2text-generation", model="google/flan-t5-base")
    return tokenizer, model, qa_pipeline_model

embedding_tokenizer, embedding_model, qa_pipeline_model = load_models()

# PDF loader
def load_pdf(file):
    reader = PyPDF2.PdfReader(file)
    text = ''
    for page in reader.pages:
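        # extract_text() can return None for pages without a text layer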
        text += page.extract_text() or ''
    return text

# Embed text
def get_embedding(text):
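    # Tokenize and truncate the input to the model's maximum sequence length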
    inputs = embedding_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        model_output = embedding_model(**inputs)
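    # Mean-pool the token embeddings into a single fixed-size sentence vector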
    return model_output.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

# Store vectors in-memory
vector_store = []

def upload_document_chunks(chunks):
    vector_store.clear()
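    # Re-embed every chunk of the newly uploaded document; (text, vector) pairs are kept in memory only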
    for chunk in chunks:
        embedding = get_embedding(chunk)
        vector_store.append((chunk, embedding))

def query_answer(query):
    query_vec = get_embedding(query)
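    # Rank stored chunks by cosine similarity to the query and return the three best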
    similarities = [cosine_similarity([query_vec], [vec])[0][0] for _, vec in vector_store]
    top_indices = np.argsort(similarities)[-3:][::-1]
    return [vector_store[i][0] for i in top_indices]

def generate_response(context, query):
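    # Prepend the retrieved chunks to the question so the model answers from the document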
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    response = qa_pipeline_model(prompt, max_new_tokens=100, do_sample=True)
    return response[0]['generated_text'].strip()

# Streamlit UI
st.set_page_config(page_title="Offline PDF QA Bot", layout="centered")
st.title("📄 Offline PDF QA Bot 🔍")
st.markdown(
    "Upload a PDF document, ask a question, and get an answer using **only local models**, with no external APIs involved."
)

uploaded_file = st.file_uploader("📁 Upload PDF", type="pdf")
user_query = st.text_input("❓ Ask a question based on the document")

if uploaded_file and user_query:
    with st.spinner("Processing..."):
        with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
            tmp_file.write(uploaded_file.read())
            tmp_file.flush()  # ensure all bytes are on disk before PyPDF2 reads the path
            document_text = load_pdf(tmp_file.name)

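        # Split the extracted text into fixed 500-character chunks (no overlap)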
        document_chunks = [document_text[i:i + 500] for i in range(0, len(document_text), 500)]

        upload_document_chunks(document_chunks)
        top_chunks = query_answer(user_query)
        context = " ".join(top_chunks)

        answer = generate_response(context, user_query)

    st.subheader("📜 Retrieved Document Segments")
    for i, chunk in enumerate(top_chunks, 1):
        st.markdown(f"**Chunk {i}:** {chunk}")

    st.subheader("💬 Answer")
    st.success(answer)