import streamlit as st
import os
import re
import tempfile

import faiss
import gdown
import numpy as np
from groq import Groq
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

# -------------------- CONFIG -------------------- #

# Read the Groq API key from the environment (never hard-code secrets in source)
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Predefined list of Google Drive PDF links (public/viewable)
PDF_LINKS = [
    "https://drive.google.com/file/d/14ZxfkSKBbvHINQnE9gWY8fwlaV9KBD4Y/view?usp=sharing",
    "https://drive.google.com/file/d/1LONqUI8Us6WDbWi35ueM5qin9bIhI0Bz/view?usp=sharing",
    "https://drive.google.com/file/d/1Robrjv3n9ckEcXUJj7zQw5o7YZ4C52tf/view?usp=sharing",
    "https://drive.google.com/file/d/113_0ixx4LyjCtfZHZYePbYMRs8mdfieG/view?usp=sharing",
]

# -------------------- UTILITY FUNCTIONS -------------------- #

def extract_gdrive_id(url):
    """Extract the file ID from a Google Drive share URL."""
    match = re.search(r'/d/([a-zA-Z0-9_-]+)', url)
    return match.group(1) if match else None


def download_from_gdrive(gdrive_url):
    """Download a publicly shared Google Drive file to a temporary PDF path."""
    file_id = extract_gdrive_id(gdrive_url)
    if not file_id:
        return None
    download_url = f"https://drive.google.com/uc?id={file_id}"
    output = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf").name
    gdown.download(download_url, output, quiet=True)
    return output


def extract_text(file_path):
    """Extract raw text from every page of a PDF."""
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""
    return text


def chunk_text(text, chunk_size=300):
    """Split text into chunks of roughly `chunk_size` words each."""
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]


def create_faiss_index(chunks):
    """Embed all chunks and build a flat L2 FAISS index over them."""
    embeddings = embed_model.encode(chunks)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))
    return index, embeddings


def search_chunks(query, chunks, index):
    """Return the chunks most similar to the query (top 3, or fewer if the index is small)."""
    query_embedding = embed_model.encode([query])
    D, I = index.search(np.array(query_embedding), k=min(3, len(chunks)))
    return [chunks[i] for i in I[0]]

# -------------------- MAIN APP -------------------- #

# Load the sentence-embedding model once at startup
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

st.title("📚 ReliefBot – Auto-Fetched Disaster PDFs with QA")

if "initialized" not in st.session_state:
    st.session_state.initialized = False

if not st.session_state.initialized:
    all_chunks = []
    with st.spinner("📥 Downloading & processing predefined documents..."):
        for url in PDF_LINKS:
            path = download_from_gdrive(url.strip())
            if path:
                raw_text = extract_text(path)
                chunks = chunk_text(raw_text)
                all_chunks.extend(chunks)
            else:
                st.warning(f"⚠️ Failed to download: {url}")

    if all_chunks:
        index, _ = create_faiss_index(all_chunks)
        st.session_state.chunks = all_chunks
        st.session_state.index = index
        st.session_state.initialized = True
        st.success("✅ Documents downloaded and indexed successfully!")

# Question interface
if st.session_state.initialized:
    query = st.text_input("❓ Ask your question about disaster safety or documents:")
    if query:
        relevant_chunks = search_chunks(query, st.session_state.chunks, st.session_state.index)
        context = "\n".join(relevant_chunks)
        prompt = f"Answer based on this context:\n{context}\n\nQuestion: {query}"
        with st.spinner("🤖 Generating answer..."):
            response = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama-3.1-8b-instant",
            )
        st.markdown("### 💬 ReliefBot Answer:")
        st.write(response.choices[0].message.content)
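
# -------------------- RUNNING THE APP -------------------- #
# A minimal sketch of how this script is typically launched, assuming it is
# saved as app.py (the filename is illustrative, not prescribed by the code):
#
#   export GROQ_API_KEY="your-groq-api-key"   # read by the client constructed above
#   streamlit run app.py
#
# Required packages: streamlit, groq, PyPDF2, sentence-transformers,
# faiss-cpu, numpy, gdown.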