# main_metamask.py
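"""
Streamlit app for question answering over PDFs (RAG) with optional blockchain
verification through MetaMask: uploaded PDFs are chunked, embedded with a
HuggingFace sentence-transformer (GPU-accelerated when available), indexed in
FAISS, and queried through a LangChain RetrievalQA chain, while document
hashes and queries can additionally be verified and logged on-chain.
"""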
import os
import tempfile
import shutil
import PyPDF2
import streamlit as st
import torch
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
import time
import psutil
import uuid
import atexit
from blockchain_utils_metamask import BlockchainManagerMetaMask
from metamask_component import metamask_connector


class BlockchainEnabledRAG:
    def __init__(self, 
                 llm_model_name="mistralai/Mistral-7B-Instruct-v0.2",
                 embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
                 chunk_size=1000,
                 chunk_overlap=200,
                 use_gpu=True,
                 use_blockchain=False,
                 contract_address=None):
        """
        Initialize the GPU-efficient RAG system with MetaMask blockchain integration.
        
        Args:
            llm_model_name: The HuggingFace model for text generation
            embedding_model_name: The HuggingFace model for embeddings
            chunk_size: Size of document chunks
            chunk_overlap: Overlap between chunks
            use_gpu: Whether to use GPU acceleration
            use_blockchain: Whether to enable blockchain verification
            contract_address: Address of the deployed RAG Document Verifier contract
        """
        self.llm_model_name = llm_model_name
        self.embedding_model_name = embedding_model_name
        self.use_gpu = use_gpu and torch.cuda.is_available()
        self.use_blockchain = use_blockchain
        
        # Device selection for embeddings
        self.device = "cuda" if self.use_gpu else "cpu"
        st.sidebar.info(f"Using device: {self.device}")
        
        # Initialize text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )
        
        # Initialize embeddings model
        self.embeddings = HuggingFaceEmbeddings(
            model_name=embedding_model_name,
            model_kwargs={"device": self.device}
        )
        
        # Initialize LLM using HuggingFaceHub instead of Ollama
        try:
            # Use HF_TOKEN from environment variables
            hf_token = os.environ.get("HF_TOKEN")
            if not hf_token:
                st.warning("No HuggingFace token found. Using model without authentication.")
                
            self.llm = HuggingFaceHub(
                repo_id=llm_model_name,
                huggingfacehub_api_token=hf_token,
                model_kwargs={"temperature": 0.7, "max_length": 1024}
            )
        except Exception as e:
            st.error(f"Error initializing LLM: {str(e)}")
            st.info("Trying to initialize with default model...")
            # Fallback to a smaller model
            self.llm = HuggingFaceHub(
                repo_id="google/flan-t5-small",
                model_kwargs={"temperature": 0.7, "max_length": 512}
            )
        
        # Initialize vector store
        self.vector_store = None
        self.documents_processed = 0
        
        # Monitoring stats
        self.processing_times = {}
        
        # Initialize blockchain manager if enabled
        self.blockchain = None
        if use_blockchain:
            try:
                self.blockchain = BlockchainManagerMetaMask(
                    contract_address=contract_address
                )
                st.sidebar.success("Blockchain manager initialized. Please connect MetaMask to continue.")
            except Exception as e:
                st.sidebar.error(f"Failed to initialize blockchain manager: {str(e)}")
                self.use_blockchain = False

    def update_blockchain_connection(self, metamask_info):
        """Update blockchain connection with MetaMask info."""
        if self.blockchain and metamask_info:
            self.blockchain.update_connection(
                is_connected=metamask_info.get("connected", False),
                user_address=metamask_info.get("address"),
                network_id=metamask_info.get("network_id")
            )
            return self.blockchain.is_connected
        return False

    def process_pdfs(self, pdf_files):
        """Process PDF files, create a vector store, and verify documents on blockchain."""
        all_docs = []
        
        with st.status("Processing PDF files...") as status:
            # Create temporary directory for file storage
            temp_dir = tempfile.mkdtemp()
            st.session_state['temp_dir'] = temp_dir
            
            # Monitor processing time and memory usage
            start_time = time.time()
            
            # Track memory before processing
            mem_before = psutil.virtual_memory().used / (1024 * 1024 * 1024)  # GB
            
            # Process each PDF file
            for i, pdf_file in enumerate(pdf_files):
                try:
                    file_start_time = time.time()
                    
                    # Save uploaded file to temp directory
                    pdf_path = os.path.join(temp_dir, pdf_file.name)
                    with open(pdf_path, "wb") as f:
                        f.write(pdf_file.getbuffer())
                    
                    status.update(label=f"Processing {pdf_file.name} ({i+1}/{len(pdf_files)})...")
                    
                    # Extract text from PDF
                    text = ""
                    with open(pdf_path, "rb") as f:
                        pdf = PyPDF2.PdfReader(f)
                        for page_num in range(len(pdf.pages)):
                            page = pdf.pages[page_num]
                            page_text = page.extract_text()
                            if page_text:
                                text += page_text + "\n\n"
                    
                    # Create documents
                    docs = [Document(page_content=text, metadata={"source": pdf_file.name})]
                    
                    # Split documents into chunks
                    split_docs = self.text_splitter.split_documents(docs)
                    
                    all_docs.extend(split_docs)
                    
                    # Verify document on blockchain if enabled and connected
                    if self.use_blockchain and self.blockchain and self.blockchain.is_connected:
                        try:
                            # Create a unique document ID
                            document_id = f"{pdf_file.name}_{uuid.uuid4().hex[:8]}"
                            
                            # Verify document on blockchain
                            status.update(label=f"Verifying {pdf_file.name} on blockchain...")
                            verification = self.blockchain.verify_document(document_id, pdf_path)
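                            # verify_document() is expected to return a dict with a truthy
                            # "status" on success (plus "document_hash", "tx_hash" and
                            # "block_number"), or an "error" message on failure, as the
                            # checks below assume.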
                            
                            if verification.get('status'):  # Success
                                st.sidebar.success(f"✅ {pdf_file.name} verified on blockchain")
                                if 'tx_hash' in verification:
                                    st.sidebar.info(f"Transaction: {verification['tx_hash'][:10]}...")
                                
                                # Add blockchain metadata to documents
                                for doc in split_docs:
                                    doc.metadata["blockchain"] = {
                                        "verified": True,
                                        "document_id": document_id,
                                        "document_hash": verification.get("document_hash", ""),
                                        "tx_hash": verification.get("tx_hash", ""),
                                        "block_number": verification.get("block_number", 0)
                                    }
                            else:
                                st.sidebar.warning(f"❌ Failed to verify {pdf_file.name} on blockchain")
                                if 'error' in verification:
                                    st.sidebar.error(f"Error: {verification['error']}")
                        except Exception as e:
                            st.sidebar.error(f"Blockchain verification error: {str(e)}")
                    elif self.use_blockchain:
                        st.sidebar.warning("MetaMask not connected. Document not verified on blockchain.")
                    
                    file_end_time = time.time()
                    processing_time = file_end_time - file_start_time
                    
                    st.sidebar.success(f"Processed {pdf_file.name}: {len(split_docs)} chunks in {processing_time:.2f}s")
                    self.processing_times[pdf_file.name] = {
                        "chunks": len(split_docs),
                        "time": processing_time
                    }
                    
                except Exception as e:
                    st.sidebar.error(f"Error processing {pdf_file.name}: {str(e)}")
            
            # Create vector store if we have documents
            if all_docs:
                status.update(label="Building vector index...")
                try:
                    # Record the time taken to build the index
                    index_start_time = time.time()
                    
                    # Create the vector store using FAISS
                    self.vector_store = FAISS.from_documents(all_docs, self.embeddings)
                    
                    index_end_time = time.time()
                    index_time = index_end_time - index_start_time
                    
                    # Track memory after processing
                    mem_after = psutil.virtual_memory().used / (1024 * 1024 * 1024)  # GB
                    mem_used = mem_after - mem_before
                    
                    total_time = time.time() - start_time
                    
                    status.update(label=f"Completed processing {len(all_docs)} chunks in {total_time:.2f}s", state="complete")
                    
                    # Save performance metrics
                    self.processing_times["index_building"] = index_time
                    self.processing_times["total_time"] = total_time
                    self.processing_times["memory_used_gb"] = mem_used
                    self.documents_processed = len(all_docs)
                    
                    return True
                except Exception as e:
                    st.error(f"Error creating vector store: {str(e)}")
                    status.update(label="Error creating vector store", state="error")
                    return False
            else:
                status.update(label="No content extracted from PDFs", state="error")
                return False

    def ask(self, query):
        """Ask a question and get an answer based on the PDFs with blockchain logging."""
        if not self.vector_store:
            return "Please upload and process PDF files first."
            
        try:
            # Custom prompt
            prompt_template = """
            You are an AI assistant that provides accurate information based on PDF documents.
            
            Use the following context to answer the question. Be detailed and precise in your answer.
            If the answer is not in the context, say "I don't have enough information to answer this question."
            
            Context:
            {context}
            
            Question: {question}
            
            Answer:
            """
            PROMPT = PromptTemplate(
                template=prompt_template, 
                input_variables=["context", "question"]
            )
            
            # Start timing the query
            query_start_time = time.time()
            
            # Create QA chain
            chain_type_kwargs = {"prompt": PROMPT}
            qa = RetrievalQA.from_chain_type(
                llm=self.llm,
                chain_type="stuff",
                retriever=self.vector_store.as_retriever(search_kwargs={"k": 4}),
                chain_type_kwargs=chain_type_kwargs,
                return_source_documents=True
            )
            
            # Get answer
            with st.status("Searching documents and generating answer..."):
                response = qa({"query": query})
                
            answer = response["result"]
            source_docs = response["source_documents"]
            
            # Calculate query time
            query_time = time.time() - query_start_time
            
            # Format sources
            sources = []
            for i, doc in enumerate(source_docs):
                # Extract blockchain verification info if available
                blockchain_info = None
                if "blockchain" in doc.metadata:
                    blockchain_info = {
                        "verified": doc.metadata["blockchain"]["verified"],
                        "document_id": doc.metadata["blockchain"]["document_id"],
                        "tx_hash": doc.metadata["blockchain"]["tx_hash"]
                    }
                
                sources.append({
                    "content": doc.page_content[:300] + "..." if len(doc.page_content) > 300 else doc.page_content,
                    "source": doc.metadata.get("source", "Unknown"),
                    "blockchain": blockchain_info
                })
            
            # Log query to blockchain if enabled and connected
            blockchain_log = None
            if self.use_blockchain and self.blockchain and self.blockchain.is_connected:
                try:
                    with st.status("Logging query to blockchain..."):
                        log_result = self.blockchain.log_query(query, answer)
                        
                        if log_result.get("status"):  # Success
                            blockchain_log = {
                                "logged": True,
                                "query_id": log_result.get("query_id", ""),
                                "tx_hash": log_result.get("tx_hash", ""),
                                "block_number": log_result.get("block_number", 0)
                            }
                        else:
                            st.error(f"Error logging to blockchain: {log_result.get('error', 'Unknown error')}")
                except Exception as e:
                    st.error(f"Error logging to blockchain: {str(e)}")
                
            return {
                "answer": answer,
                "sources": sources,
                "query_time": query_time,
                "blockchain_log": blockchain_log
            }
                
        except Exception as e:
            st.error(f"Error generating answer: {str(e)}")
            return f"Error: {str(e)}"

    def get_performance_metrics(self):
        """Return performance metrics for the RAG system."""
        if not self.processing_times:
            return None
            
        return {
            "documents_processed": self.documents_processed,
            "index_building_time": self.processing_times.get("index_building", 0),
            "total_processing_time": self.processing_times.get("total_time", 0),
            "memory_used_gb": self.processing_times.get("memory_used_gb", 0),
            "device": self.device,
            "embedding_model": self.embedding_model_name,
            "blockchain_enabled": self.use_blockchain,
            "blockchain_connected": self.blockchain.is_connected if self.blockchain else False
        }


# Helper function to initialize session state
def initialize_session_state():
    """Initialize Streamlit session state variables."""
    if "rag" not in st.session_state:
        st.session_state.rag = None
    if "messages" not in st.session_state:
        st.session_state.messages = []
    if "temp_dir" not in st.session_state:
        st.session_state.temp_dir = None
    if "metamask_connected" not in st.session_state:
        st.session_state.metamask_connected = False

# Helper function to clean up temporary files
def cleanup_temp_files():
    """Clean up temporary files when the application exits."""
    try:
        temp_dir = st.session_state.get('temp_dir')
    except Exception:
        # Session state may be unavailable once the Streamlit script run has ended.
        return
    if temp_dir and os.path.exists(temp_dir):
        try:
            shutil.rmtree(temp_dir)
            print(f"Cleaned up temporary directory: {temp_dir}")
        except Exception as e:
            print(f"Error cleaning up temporary directory: {e}")


# Streamlit UI
def main():
    st.set_page_config(page_title="Blockchain-Enabled RAG System", layout="wide")
    
    st.title("🚀 GPU-Accelerated PDF Question Answering with MetaMask Blockchain Verification")
    st.markdown("Upload PDFs, verify them on the blockchain with MetaMask, and ask questions with an on-chain audit log")
    
    # Initialize session state
    initialize_session_state()
    
    # MetaMask Connection Section
    st.header("🦊 MetaMask Connection")
    st.markdown("Connect your MetaMask wallet to verify documents and log queries on the blockchain.")
    
    # Add MetaMask connector and get connection info
    metamask_info = metamask_connector()
    
    # Display MetaMask connection status
    if metamask_info and metamask_info.get("connected"):
        st.success(f"✅ MetaMask Connected: {metamask_info.get('address')}")
        st.info(f"Network: {metamask_info.get('network_name')}")
        st.session_state.metamask_connected = True
    else:
        st.warning("⚠️ MetaMask not connected. Please connect your wallet to use blockchain features.")
        st.session_state.metamask_connected = False
    
    # Update RAG system with MetaMask connection if needed
    if st.session_state.rag and metamask_info:
        is_connected = st.session_state.rag.update_blockchain_connection(metamask_info)
        if is_connected:
            st.success("RAG system updated with MetaMask connection")
    
    # Sidebar for configuration and file upload
    with st.sidebar:
        st.header("⚙️ Configuration")
        
        # GPU Detection
        gpu_available = torch.cuda.is_available()
        if gpu_available:
            try:
                gpu_info = torch.cuda.get_device_properties(0)
                st.success(f"GPU detected: {gpu_info.name} ({gpu_info.total_memory / 1024**3:.1f} GB)")
            except Exception as e:
                st.warning(f"GPU detected but couldn't get properties: {str(e)}")
                st.info("Running with limited GPU information")
        else:
            st.warning("No GPU detected. Running in CPU mode.")
        
        # Model selection
        llm_model = st.selectbox(
            "LLM Model",
            options=[
                "mistralai/Mistral-7B-Instruct-v0.2",
                "google/flan-t5-base",
                "tiiuae/falcon-7b-instruct"
            ],
            index=0
        )
        
        embedding_model = st.selectbox(
            "Embedding Model",
            options=[
                "sentence-transformers/all-mpnet-base-v2",
                "sentence-transformers/all-MiniLM-L6-v2",
                "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
            ],
            index=1  # all-MiniLM-L6-v2 is smaller and faster
        )
        
        use_gpu = st.checkbox("Use GPU Acceleration", value=gpu_available)
        
        # Blockchain configuration
        st.header("🔗 Blockchain Configuration")
        use_blockchain = st.checkbox("Enable Blockchain Verification", value=True)
        
        if use_blockchain:
            contract_address = st.text_input("Contract Address", 
                                            value="0x0000000000000000000000000000000000000000")
            
            # Display MetaMask connection status in sidebar
            if metamask_info and metamask_info.get("connected"):
                st.success(f"✅ MetaMask Connected: {metamask_info.get('address')[:10]}...")
            else:
                st.warning("⚠️ MetaMask not connected. Please connect your wallet above.")
            
            if not contract_address or contract_address == "0x0000000000000000000000000000000000000000":
                st.error("Please deploy the contract and enter its address")
        
        # Advanced options
        with st.expander("Advanced Options"):
            chunk_size = st.slider("Chunk Size", 100, 2000, 1000)
            chunk_overlap = st.slider("Chunk Overlap", 0, 500, 200)
        
        # Initialize button
        if st.button("Initialize System"):
            with st.spinner("Initializing RAG system..."):
                if use_blockchain and not contract_address:
                    st.error("Contract address is required for blockchain integration")
                else:
                    st.session_state.rag = BlockchainEnabledRAG(
                        llm_model_name=llm_model,
                        embedding_model_name=embedding_model,
                        chunk_size=chunk_size,
                        chunk_overlap=chunk_overlap,
                        use_gpu=use_gpu and gpu_available,
                        use_blockchain=use_blockchain,
                        contract_address=contract_address if use_blockchain else None
                    )
                    
                    # Update with current MetaMask connection if available
                    if use_blockchain and metamask_info:
                        st.session_state.rag.update_blockchain_connection(metamask_info)
                    
                    st.success(f"System initialized with {embedding_model} on {st.session_state.rag.device}")
                    if use_blockchain:
                        if metamask_info and metamask_info.get("connected"):
                            st.success("Blockchain verification enabled with MetaMask")
                        else:
                            st.warning("Blockchain verification enabled but MetaMask not connected")
        
        st.header("📄 Upload Documents")
        uploaded_files = st.file_uploader("Select PDFs", type="pdf", accept_multiple_files=True)
        
        if uploaded_files and st.button("Process PDFs"):
            if not st.session_state.rag:
                with st.spinner("Initializing RAG system..."):
                    st.session_state.rag = BlockchainEnabledRAG(
                        llm_model_name=llm_model,
                        embedding_model_name=embedding_model,
                        chunk_size=chunk_size,
                        chunk_overlap=chunk_overlap,
                        use_gpu=use_gpu and gpu_available,
                        use_blockchain=use_blockchain,
                        contract_address=contract_address if use_blockchain else None
                    )
                    
                    # Update with current MetaMask connection if available
                    if use_blockchain and metamask_info:
                        st.session_state.rag.update_blockchain_connection(metamask_info)
            
            success = st.session_state.rag.process_pdfs(uploaded_files)
            if success:
                metrics = st.session_state.rag.get_performance_metrics()
                if metrics:
                    st.success("PDFs processed successfully!")
                    with st.expander("💹 Performance Metrics"):
                        st.markdown(f"**Documents processed:** {metrics['documents_processed']} chunks")
                        st.markdown(f"**Index building time:** {metrics['index_building_time']:.2f} seconds")
                        st.markdown(f"**Total processing time:** {metrics['total_processing_time']:.2f} seconds")
                        st.markdown(f"**Memory used:** {metrics['memory_used_gb']:.2f} GB")
                        st.markdown(f"**Device used:** {metrics['device']}")
                        st.markdown(f"**Blockchain verification:** {'Enabled' if metrics['blockchain_enabled'] else 'Disabled'}")
                        st.markdown(f"**Blockchain connected:** {'Yes' if metrics.get('blockchain_connected') else 'No'}")
    
    # Blockchain verification info
    if st.session_state.rag and st.session_state.rag.use_blockchain:
        if st.session_state.metamask_connected:
            st.info("🔗 Blockchain verification is enabled with MetaMask. Documents are cryptographically verified and queries are logged with an immutable audit trail.")
        else:
            st.warning("🔗 Blockchain verification is enabled but MetaMask is not connected. Please connect your MetaMask wallet to use blockchain features.")
    
    # Display chat messages
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            if message["role"] == "user":
                st.markdown(message["content"])
            else:
                if isinstance(message["content"], dict):
                    st.markdown(message["content"]["answer"])
                    
                    if "query_time" in message["content"]:
                        st.caption(f"Response time: {message['content']['query_time']:.2f} seconds")
                    
                    # Display blockchain log if available
                    if "blockchain_log" in message["content"] and message["content"]["blockchain_log"]:
                        blockchain_log = message["content"]["blockchain_log"]
                        st.success(f"✅ Query logged on blockchain | Transaction: {blockchain_log['tx_hash'][:10]}...")
                    
                    # Display sources in expander
                    if "sources" in message["content"] and message["content"]["sources"]:
                        with st.expander("📄 View Sources"):
                            for i, source in enumerate(message["content"]["sources"]):
                                st.markdown(f"**Source {i+1}: {source['source']}**")
                                
                                # Show blockchain verification if available
                                if source.get("blockchain"):
                                    st.success(f"✅ Verified on blockchain | TX: {source['blockchain']['tx_hash'][:10]}...")
                                    
                                st.text(source["content"])
                                st.divider()
                else:
                    st.markdown(message["content"])
    
    # Chat input
    if prompt := st.chat_input("Ask a question about your PDFs..."):
        # Add user message to chat
        st.session_state.messages.append({"role": "user", "content": prompt})
        
        # Display user message
        with st.chat_message("user"):
            st.markdown(prompt)
        
        # Check if system is initialized
        if not st.session_state.rag:
            with st.chat_message("assistant"):
                message = "Please initialize the system and process PDFs first."
                st.markdown(message)
                st.session_state.messages.append({"role": "assistant", "content": message})
        
        # Get response if vector store is ready
        elif st.session_state.rag.vector_store:
            with st.chat_message("assistant"):
                response = st.session_state.rag.ask(prompt)
                st.session_state.messages.append({"role": "assistant", "content": response})
                
                if isinstance(response, dict):
                    st.markdown(response["answer"])
                    
                    if "query_time" in response:
                        st.caption(f"Response time: {response['query_time']:.2f} seconds")
                    
                    # Display blockchain log if available
                    if "blockchain_log" in response and response["blockchain_log"]:
                        blockchain_log = response["blockchain_log"]
                        st.success(f"✅ Query logged on blockchain | Transaction: {blockchain_log['tx_hash'][:10]}...")
                    
                    # Display sources in expander
                    if "sources" in response and response["sources"]:
                        with st.expander("📄 View Sources"):
                            for i, source in enumerate(response["sources"]):
                                st.markdown(f"**Source {i+1}: {source['source']}**")
                                
                                # Show blockchain verification if available
                                if source.get("blockchain"):
                                    st.success(f"✅ Verified on blockchain | TX: {source['blockchain']['tx_hash'][:10]}...")
                                    
                                st.text(source["content"])
                                st.divider()
                else:
                    st.markdown(response)
        else:
            with st.chat_message("assistant"):
                message = "Please upload and process PDF files first."
                st.markdown(message)
                st.session_state.messages.append({"role": "assistant", "content": message})


# Main entry point
if __name__ == "__main__":
    # Register cleanup function
    atexit.register(cleanup_temp_files)
    
    main()
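
# To run locally (assuming Streamlit and the dependencies imported above are
# installed, and HF_TOKEN is exported for authenticated HuggingFace Hub access):
#   streamlit run main_metamask.py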