import streamlit as st
import os
from groq import Groq
import tempfile
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gdown
import re
# -------------------- CONFIG -------------------- #

# Set your Groq API key via the GROQ_API_KEY environment variable;
# never hard-code the secret itself in source.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
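
# Usage sketch (assumes the key is exported in the shell before launch):
#   export GROQ_API_KEY="gsk_..."      # placeholder, not a real key
#   streamlit run app.py
# The groq client also falls back to GROQ_API_KEY on its own if api_key is omitted.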
# Predefined list of Google Drive PDF links (public/viewable)
PDF_LINKS = [
"https://drive.google.com/file/d/14ZxfkSKBbvHINQnE9gWY8fwlaV9KBD4Y/view?usp=sharing",
"https://drive.google.com/file/d/1LONqUI8Us6WDbWi35ueM5qin9bIhI0Bz/view?usp=sharing",
"https://drive.google.com/file/d/1Robrjv3n9ckEcXUJj7zQw5o7YZ4C52tf/view?usp=sharing",
"https://drive.google.com/file/d/113_0ixx4LyjCtfZHZYePbYMRs8mdfieG/view?usp=sharing"
]
# -------------------- UTILITY FUNCTIONS -------------------- #
# Convert Google Drive URL to downloadable ID
def extract_gdrive_id(url):
    match = re.search(r'/d/([a-zA-Z0-9_-]+)', url)
    return match.group(1) if match else None
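
# Example, using the first link above:
#   extract_gdrive_id("https://drive.google.com/file/d/14ZxfkSKBbvHINQnE9gWY8fwlaV9KBD4Y/view?usp=sharing")
#   -> "14ZxfkSKBbvHINQnE9gWY8fwlaV9KBD4Y"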
# Download file from Google Drive
def download_from_gdrive(gdrive_url):
    file_id = extract_gdrive_id(gdrive_url)
    if not file_id:
        return None
    download_url = f"https://drive.google.com/uc?id={file_id}"
    output = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf").name
    # gdown.download returns the output path on success and None on failure;
    # returning its result (rather than `output` unconditionally) lets the
    # caller's truthiness check actually detect failed downloads.
    return gdown.download(download_url, output, quiet=True)
# Extract raw text from a PDF
def extract_text(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""
    return text
# Chunk text into small pieces
def chunk_text(text, chunk_size=300):
    words = text.split()
    return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
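
# Example: a 650-word document becomes three chunks of 300, 300, and 50 words.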
# Create FAISS index
def create_faiss_index(chunks):
    embeddings = embed_model.encode(chunks)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))
    return index, embeddings
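
# IndexFlatL2 performs exact (brute-force) L2 search. For all-MiniLM-L6-v2 the
# embedding dimension is 384, and encode() already returns the float32 arrays
# that FAISS expects.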
# Search top-k chunks
def search_chunks(query, chunks, index):
    query_embedding = embed_model.encode([query])
    D, I = index.search(np.array(query_embedding), k=3)
    return [chunks[i] for i in I[0]]
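
# index.search returns distances D and row indices I, each shaped (n_queries, k);
# I[0] holds the positions of the 3 nearest chunks for the single query.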
# -------------------- MAIN APP -------------------- #
# Load embedding model
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
st.title("ReliefBot - Auto-Fetched Disaster PDFs with QA")
if "initialized" not in st.session_state:
st.session_state.initialized = False
if not st.session_state.initialized:
    all_chunks = []
    with st.spinner("📥 Downloading & processing predefined documents..."):
        for url in PDF_LINKS:
            path = download_from_gdrive(url.strip())
            if path:
                raw_text = extract_text(path)
                chunks = chunk_text(raw_text)
                all_chunks.extend(chunks)
            else:
                st.warning(f"⚠️ Failed to download: {url}")
    if all_chunks:
        index, _ = create_faiss_index(all_chunks)
        st.session_state.chunks = all_chunks
        st.session_state.index = index
        st.session_state.initialized = True
        st.success("✅ Documents downloaded and indexed successfully!")
# Question interface
if st.session_state.initialized:
    query = st.text_input("❓ Ask your question about disaster safety or documents:")
    if query:
        relevant_chunks = search_chunks(query, st.session_state.chunks, st.session_state.index)
        context = "\n".join(relevant_chunks)
        prompt = f"Answer based on this context:\n{context}\n\nQuestion: {query}"
        with st.spinner("🤖 Generating answer..."):
            response = client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama-3.1-8b-instant",
            )
        st.markdown("### 💬 ReliefBot Answer:")
        st.write(response.choices[0].message.content)
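
# A sketch for running this Space locally (assumption: the dependency set is
# inferred from the imports above; pin versions as needed):
#   pip install streamlit groq PyPDF2 sentence-transformers faiss-cpu numpy gdown
#   GROQ_API_KEY="gsk_..." streamlit run app.py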