import gradio as gr
import os
import re
import PyPDF2
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json


class PDFAnalyzer:
    """Semantic question-answering over a single uploaded PDF.

    The document is split into fixed-size character chunks, each chunk is
    embedded with a SentenceTransformer model, and queries are answered by
    cosine similarity between the question embedding and the chunk embeddings.
    All public methods return JSON strings shaped like a REST API response
    (``status`` / ``message`` / ``results``).
    """

    def __init__(self):
        self.text_chunks = []   # list[str]: character chunks of the active PDF
        self.embeddings = None  # np.ndarray: one embedding row per chunk
        self.active_doc = None  # basename of the processed PDF, or None
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def process_pdf(self, filepath):
        """Extract, chunk, and embed the PDF at *filepath*.

        Returns a JSON string: status 200 with a ``document_id`` on success,
        status 500 with the error text on any failure.
        """
        try:
            text = self._extract_text(filepath)
            self.text_chunks = self._chunk_text(text)
            self.embeddings = self.model.encode(self.text_chunks)
            self.active_doc = os.path.basename(filepath)
            return json.dumps({
                "status": 200,
                "message": f"Document {self.active_doc} processed successfully",
                # NOTE(review): hash() is salted per interpreter run, so this
                # id is not stable across restarts — confirm no caller persists it.
                "document_id": hash(self.active_doc)
            })
        except Exception as e:
            # Top-level boundary: report any processing failure as a 500 payload.
            return json.dumps({
                "status": 500,
                "error": str(e),
                "message": "Document processing failed"
            })

    def _extract_text(self, filepath):
        """Concatenate the extracted text of every page in the PDF."""
        with open(filepath, 'rb') as f:
            # extract_text() may return None for image-only/empty pages;
            # substitute '' so the join cannot raise TypeError.
            return ''.join(page.extract_text() or ''
                           for page in PyPDF2.PdfReader(f).pages)

    def _chunk_text(self, text, chunk_size=500):
        """Split *text* into consecutive chunks of *chunk_size* characters."""
        return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    def query(self, question):
        """Answer *question* against the active document.

        Returns a JSON string with one best-matching result (status 200),
        or a status-400 payload when no document has been processed yet.
        """
        if not self.active_doc:
            return json.dumps({
                "status": 400,
                "message": "No document uploaded",
                "results": []
            })
        ques_emb = self.model.encode(question)
        similarities = cosine_similarity([ques_emb], self.embeddings)[0]
        # Convert NumPy scalars to native Python types so json.dumps works.
        best_idx = int(np.argmax(similarities))
        confidence = float(similarities[best_idx])
        full_answer = self.text_chunks[best_idx]
        return json.dumps({
            "status": 200,
            "message": "Success",
            "results": [{
                "text": self._format_answer(full_answer, question),
                "confidence": confidence,
                "document_id": str(hash(self.active_doc)),  # string for JSON safety
                "metadata": {
                    "chunk_index": best_idx,
                    "document": self.active_doc
                }
            }]
        }, default=str)  # fallback: stringify anything non-serializable

    def _format_answer(self, text, question):
        """Return ~100 words of context around the sentence that shares the
        most vocabulary with *question*, with a trailing ellipsis if truncated.
        """
        sentences = re.split(r'(?<=[.!?]) +', text)
        question_words = set(question.lower().split())
        best_sentence = max(
            sentences,
            key=lambda s: len(set(s.lower().split()) & question_words),
            default="")
        all_words = ' '.join(sentences).split()
        try:
            # Start ~50 words before the best sentence's first word.
            start = max(0, all_words.index(best_sentence.split()[0]) - 50)
        except (IndexError, ValueError):
            # best_sentence was empty, or its first word isn't found verbatim.
            start = 0
        end = start + 100
        return ' '.join(all_words[start:end]) + ("..." if end < len(all_words) else "")


def create_app():
    """Build and return the Gradio Blocks UI wired to a fresh PDFAnalyzer."""
    analyzer = PDFAnalyzer()

    def format_response(response):
        """Render the analyzer's JSON reply as Markdown for the chatbot."""
        try:
            data = json.loads(response)
            if data['status'] != 200:
                return f"Error: {data.get('message', 'Unknown error')}"
            result = data['results'][0]
            return f"**Answer** ({result['confidence']:.2f} confidence):\n{result['text']}"
        except (json.JSONDecodeError, KeyError, IndexError, TypeError):
            # Malformed or incomplete payload — show a generic error.
            return "Error processing response"

    with gr.Blocks(theme=gr.themes.Soft()) as app:
        gr.Markdown("# 📑 PDF QA Assistant (Cohere-style API)")
        with gr.Row():
            with gr.Column(scale=1):
                pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
                status = gr.Markdown("**Status:** Idle")
                gr.Button("Process PDF").click(
                    lambda f: analyzer.process_pdf(f.name) if f
                    else json.dumps({"status": 400, "error": "No file"}),
                    inputs=pdf_upload,
                    outputs=status
                )
            with gr.Column(scale=2):
                chatbot = gr.Chatbot(height=400)
                question = gr.Textbox(label="Query",
                                      placeholder="Enter your question...")
                question.submit(
                    lambda q, h: h + [(q, format_response(analyzer.query(q)))],
                    inputs=[question, chatbot],
                    outputs=chatbot
                )
        gr.Button("Clear Session").click(
            lambda: [None, None, "**Status:** Session cleared"],
            outputs=[chatbot, pdf_upload, status]
        )
    return app


if __name__ == "__main__":
    create_app().launch()