import streamlit as st
import os
from groq import Groq
import tempfile
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gdown
import re
# -------------------- CONFIG -------------------- #

# Set your Groq API key via the GROQ_API_KEY environment variable;
# never hard-code the secret itself in source.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
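
# Usage sketch (assumes the key is exported in the shell before launch):
#   export GROQ_API_KEY="gsk_..."      # placeholder, not a real key
#   streamlit run app.py
# The groq client also falls back to GROQ_API_KEY on its own if api_key is omitted.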
# Predefined list of Google Drive PDF links (public/viewable)
PDF_LINKS = [
"https://drive.google.com/file/d/14ZxfkSKBbvHINQnE9gWY8fwlaV9KBD4Y/view?usp=sharing",
"https://drive.google.com/file/d/1LONqUI8Us6WDbWi35ueM5qin9bIhI0Bz/view?usp=sharing",
"https://drive.google.com/file/d/1Robrjv3n9ckEcXUJj7zQw5o7YZ4C52tf/view?usp=sharing",
"https://drive.google.com/file/d/113_0ixx4LyjCtfZHZYePbYMRs8mdfieG/view?usp=sharing"
]
# -------------------- UTILITY FUNCTIONS -------------------- #
# Convert Google Drive URL to downloadable ID
def extract_gdrive_id(url):
    match = re.search(r'/d/([a-zA-Z0-9_-]+)', url)
    return match.group(1) if match else None
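
# Example, using the first link above:
#   extract_gdrive_id("https://drive.google.com/file/d/14ZxfkSKBbvHINQnE9gWY8fwlaV9KBD4Y/view?usp=sharing")
#   -> "14ZxfkSKBbvHINQnE9gWY8fwlaV9KBD4Y"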
# Download file from Google Drive
def download_from_gdrive(gdrive_url):
    file_id = extract_gdrive_id(gdrive_url)
    if not file_id:
        return None
    download_url = f"https://drive.google.com/uc?id={file_id}"
    output = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf").name
    # gdown.download returns the output path on success and None on failure;
    # returning its result (rather than `output` unconditionally) lets the
    # caller's truthiness check actually detect failed downloads.
    return gdown.download(download_url, output, quiet=True)
# Extract raw text from a PDF
def extract_text(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""
    return text
# Chunk text into small pieces
def chunk_text(text, chunk_size=300):
    words = text.split()
    return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
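
# Example: a 650-word document becomes three chunks of 300, 300, and 50 words.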
# Create FAISS index
def create_faiss_index(chunks):
    embeddings = embed_model.encode(chunks)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))
    return index, embeddings
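
# IndexFlatL2 performs exact (brute-force) L2 search. For all-MiniLM-L6-v2 the
# embedding dimension is 384, and encode() already returns the float32 arrays
# that FAISS expects.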
# Search top-k chunks
def search_chunks(query, chunks, index):
    query_embedding = embed_model.encode([query])
    D, I = index.search(np.array(query_embedding), k=3)
    return [chunks[i] for i in I[0]]
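
# index.search returns distances D and row indices I, each shaped (n_queries, k);
# I[0] holds the positions of the 3 nearest chunks for the single query.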
# -------------------- MAIN APP -------------------- #
# Load embedding model
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
st.title("ReliefBot - Auto-Fetched Disaster PDFs with QA")
if "initialized" not in st.session_state:
st.session_state.initialized = False
if not st.session_state.initialized:
    all_chunks = []
    with st.spinner("📥 Downloading & processing predefined documents..."):
        for url in PDF_LINKS:
            path = download_from_gdrive(url.strip())
            if path:
                raw_text = extract_text(path)
                chunks = chunk_text(raw_text)
                all_chunks.extend(chunks)
            else:
                st.warning(f"⚠️ Failed to download: {url}")
    if all_chunks:
        index, _ = create_faiss_index(all_chunks)
        st.session_state.chunks = all_chunks
        st.session_state.index = index
        st.session_state.initialized = True
        st.success("✅ Documents downloaded and indexed successfully!")
# Question interface
if st.session_state.initialized:
    query = st.text_input("❓ Ask your question about disaster safety or documents:")
    if query:
        relevant_chunks = search_chunks(query, st.session_state.chunks, st.session_state.index)
        context = "\n".join(relevant_chunks)
        prompt = f"Answer based on this context:\n{context}\n\nQuestion: {query}"
        with st.spinner("🤖 Generating answer..."):
            response = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama-3.1-8b-instant",
            )
        st.markdown("### 💬 ReliefBot Answer:")
        st.write(response.choices[0].message.content)
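
# A sketch for running this Space locally (assumption: the dependency set is
# inferred from the imports above; pin versions as needed):
#   pip install streamlit groq PyPDF2 sentence-transformers faiss-cpu numpy gdown
#   GROQ_API_KEY="gsk_..." streamlit run app.py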