import streamlit as st
import os
from groq import Groq
import tempfile
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gdown
import re
# -------------------- CONFIG -------------------- #

# Read the Groq API key from the environment; never hard-code the key itself
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
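# Note: GROQ_API_KEY is the conventional variable name (it is also what the
# Groq client falls back to when no api_key is passed); on Hugging Face Spaces
# it can be configured as a repository secret.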
# Predefined list of Google Drive PDF links (public/viewable)
PDF_LINKS = [
    "https://drive.google.com/file/d/14ZxfkSKBbvHINQnE9gWY8fwlaV9KBD4Y/view?usp=sharing",
    "https://drive.google.com/file/d/1LONqUI8Us6WDbWi35ueM5qin9bIhI0Bz/view?usp=sharing",
    "https://drive.google.com/file/d/1Robrjv3n9ckEcXUJj7zQw5o7YZ4C52tf/view?usp=sharing",
    "https://drive.google.com/file/d/113_0ixx4LyjCtfZHZYePbYMRs8mdfieG/view?usp=sharing"
]
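# Note: gdown can only fetch files shared as "Anyone with the link";
# private files will fail to download and trigger the warning shown later.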
# -------------------- UTILITY FUNCTIONS -------------------- #

# Extract the file ID from a Google Drive share URL
def extract_gdrive_id(url):
    match = re.search(r'/d/([a-zA-Z0-9_-]+)', url)
    return match.group(1) if match else None
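# Example (hypothetical ID, for illustration):
#   extract_gdrive_id("https://drive.google.com/file/d/abc123/view?usp=sharing")
#   -> "abc123"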
# Download file from Google Drive to a temporary path
def download_from_gdrive(gdrive_url):
    file_id = extract_gdrive_id(gdrive_url)
    if not file_id:
        return None
    download_url = f"https://drive.google.com/uc?id={file_id}"
    output = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf").name
    # gdown.download returns the output path on success and None on failure,
    # so a failed download propagates to the caller's `if path:` check
    return gdown.download(download_url, output, quiet=True)
# Extract raw text from a PDF
def extract_text(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        # extract_text() may return None for image-only pages
        text += page.extract_text() or ""
    return text
# Chunk text into small pieces
def chunk_text(text, chunk_size=300):
    words = text.split()
    return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
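# Example: with chunk_size=3, "a b c d e f g" -> ["a b c", "d e f", "g"].
# Chunks here are fixed-size and non-overlapping; an overlapping window
# (a common RAG refinement, not used in this app) would instead step by
# chunk_size minus the overlap so context isn't cut at chunk boundaries.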
# Create FAISS index
def create_faiss_index(chunks):
    embeddings = embed_model.encode(chunks)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))
    return index, embeddings
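# IndexFlatL2 is an exact (brute-force) L2-distance index, which is fine for
# the small number of chunks a handful of PDFs produces; for much larger
# corpora an approximate index such as faiss.IndexIVFFlat would be the usual
# swap-in.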
# Search top-k chunks
def search_chunks(query, chunks, index):
    query_embedding = embed_model.encode([query])
    # Cap k so FAISS never asks for more neighbors than there are chunks
    D, I = index.search(np.array(query_embedding), k=min(3, len(chunks)))
    return [chunks[i] for i in I[0]]
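# Example (hypothetical query): search_chunks("flood evacuation steps", chunks, index)
# returns the chunks whose embeddings lie closest (by L2 distance) to the
# query embedding.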
# -------------------- MAIN APP -------------------- #

# Load embedding model
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
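# Note: Streamlit reruns this script on every interaction, so the model is
# reloaded each time. A cached variant (sketch, using Streamlit's
# st.cache_resource; load_embed_model is an illustrative helper name):
#
#     @st.cache_resource
#     def load_embed_model():
#         return SentenceTransformer('all-MiniLM-L6-v2')
#     embed_model = load_embed_model()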
st.title("ReliefBot – Auto-Fetched Disaster PDFs with QA")
if "initialized" not in st.session_state: | |
st.session_state.initialized = False | |
if not st.session_state.initialized: | |
all_chunks = [] | |
with st.spinner("π₯ Downloading & processing predefined documents..."): | |
for url in PDF_LINKS: | |
path = download_from_gdrive(url.strip()) | |
if path: | |
raw_text = extract_text(path) | |
chunks = chunk_text(raw_text) | |
all_chunks.extend(chunks) | |
else: | |
st.warning(f"β οΈ Failed to download: {url}") | |
if all_chunks: | |
index, _ = create_faiss_index(all_chunks) | |
st.session_state.chunks = all_chunks | |
st.session_state.index = index | |
st.session_state.initialized = True | |
st.success("β Documents downloaded and indexed successfully!") | |
# Question interface
if st.session_state.initialized:
    query = st.text_input("Ask your question about disaster safety or documents:")
    if query:
        relevant_chunks = search_chunks(query, st.session_state.chunks, st.session_state.index)
        context = "\n".join(relevant_chunks)
        prompt = f"Answer based on this context:\n{context}\n\nQuestion: {query}"
        with st.spinner("Generating answer..."):
            response = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama-3.1-8b-instant"
            )
        st.markdown("### ReliefBot Answer:")
        st.write(response.choices[0].message.content)