File size: 4,029 Bytes
d45d709
 
 
 
 
 
 
 
 
 
 
 
 
d340d21
d45d709
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import streamlit as st
import os
from groq import Groq
import tempfile
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gdown
import re

# -------------------- CONFIG -------------------- #
# SECURITY FIX: the previous version passed a hardcoded Groq API key string
# as the *name* of an environment variable. That both leaked the secret in
# source control (it should be revoked) and made os.environ.get() return
# None, so authentication could never work. Read the key from the
# conventionally named GROQ_API_KEY environment variable instead.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Predefined list of Google Drive PDF links (public/viewable)
PDF_LINKS = [
    "https://drive.google.com/file/d/14ZxfkSKBbvHINQnE9gWY8fwlaV9KBD4Y/view?usp=sharing",
    "https://drive.google.com/file/d/1LONqUI8Us6WDbWi35ueM5qin9bIhI0Bz/view?usp=sharing",
    "https://drive.google.com/file/d/1Robrjv3n9ckEcXUJj7zQw5o7YZ4C52tf/view?usp=sharing",
    "https://drive.google.com/file/d/113_0ixx4LyjCtfZHZYePbYMRs8mdfieG/view?usp=sharing"
]

# -------------------- UTILITY FUNCTIONS -------------------- #

# Pull the file ID out of a Google Drive share URL.
def extract_gdrive_id(url):
    """Return the ``/d/<id>`` segment of a Drive share link, or None.

    Drive share URLs look like ``.../file/d/<FILE_ID>/view?...``; the ID
    itself is limited to word characters plus ``-``.
    """
    found = re.search(r'/d/([a-zA-Z0-9_-]+)', url)
    if found is None:
        return None
    return found.group(1)

# Download file from Google Drive
def download_from_gdrive(gdrive_url):
    """Download the PDF behind a Drive share URL to a temp file.

    Returns the local file path on success, or None when the URL has no
    recognizable file ID or the download itself fails.
    """
    file_id = extract_gdrive_id(gdrive_url)
    if not file_id:
        return None
    download_url = f"https://drive.google.com/uc?id={file_id}"
    # Close the handle immediately: we only need a unique path, and an
    # open handle would block gdown from writing the file on Windows.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    tmp.close()
    # gdown.download returns the output path on success and None on
    # failure; previously the path was returned unconditionally, so the
    # caller's failure warning could never fire.
    result = gdown.download(download_url, tmp.name, quiet=True)
    return tmp.name if result else None

# Extract raw text from a PDF
def extract_text(file_path):
    """Return the concatenated text of every page in the PDF at *file_path*.

    Pages with no extractable text contribute the empty string
    (``extract_text()`` can return None).
    """
    reader = PdfReader(file_path)
    return "".join(page.extract_text() or "" for page in reader.pages)

# Chunk text into small pieces
def chunk_text(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words."""
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(' '.join(words[start:start + chunk_size]))
    return chunks

# Create FAISS index
def create_faiss_index(chunks):
    """Embed *chunks* and build a flat (exact) L2 FAISS index over them.

    Returns a ``(index, embeddings)`` pair.
    """
    vectors = embed_model.encode(chunks)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(np.array(vectors))
    return index, vectors

# Search top-k chunks
def search_chunks(query, chunks, index, k=3):
    """Return up to *k* chunks most similar to *query*.

    The previous version hardcoded k=3 and indexed with FAISS results
    directly; when the index holds fewer than k vectors FAISS pads the
    result with -1, and ``chunks[-1]`` silently returned the wrong chunk.
    Negative ids are filtered out here, and *k* is now a parameter
    (default 3, backward compatible).
    """
    query_embedding = embed_model.encode([query])
    _, ids = index.search(np.array(query_embedding), k=k)
    return [chunks[i] for i in ids[0] if i >= 0]

# -------------------- MAIN APP -------------------- #

# Sentence-embedding model shared by the indexing/search helpers above.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

st.title("📚 ReliefBot – Auto-Fetched Disaster PDFs with QA")

# One-time ingestion flag, persisted across reruns in session state.
st.session_state.setdefault("initialized", False)

if not st.session_state.initialized:
    all_chunks = []

    with st.spinner("📥 Downloading & processing predefined documents..."):
        for url in PDF_LINKS:
            path = download_from_gdrive(url.strip())
            if not path:
                st.warning(f"⚠️ Failed to download: {url}")
                continue
            all_chunks.extend(chunk_text(extract_text(path)))

        if all_chunks:
            index, _ = create_faiss_index(all_chunks)
            st.session_state.chunks = all_chunks
            st.session_state.index = index
            st.session_state.initialized = True
            st.success("✅ Documents downloaded and indexed successfully!")

# Question interface — only shown once the corpus is indexed.
if st.session_state.initialized:
    query = st.text_input("❓ Ask your question about disaster safety or documents:")

    if query:
        relevant_chunks = search_chunks(query, st.session_state.chunks, st.session_state.index)
        context = "\n".join(relevant_chunks)
        prompt = f"Answer based on this context:\n{context}\n\nQuestion: {query}"

        with st.spinner("🤖 Generating answer..."):
            response = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama-3.1-8b-instant"
            )
            st.markdown("### 💬 ReliefBot Answer:")
            st.write(response.choices[0].message.content)