import os
import tempfile
from pathlib import Path

import gradio as gr

# Vector store and embeddings from the LangChain community package
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
# Text splitter to break large documents into manageable chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
# HF Inference client for running hosted text / vision models
from huggingface_hub import InferenceClient
# Unstructured for PDF processing with image extraction
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.utils.constants import PartitionStrategy

# ── Globals ───────────────────────────────────────────────────────────────────
index = None              # FAISS index storing document embeddings
retriever = None          # Retriever to fetch relevant chunks
current_pdf_name = None   # Name of the currently loaded PDF
extracted_content = None  # Combined text and image descriptions

# ── HF Inference clients ──────────────────────────────────────────────────────
# Text generation client (instruction-tuned open model)
text_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
# Vision client for image analysis / captioning
vision_client = InferenceClient(model="llava-hf/llava-1.5-7b-hf")

# ── Embeddings ────────────────────────────────────────────────────────────────
# BGE embeddings for vectorizing text chunks
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Temporary working directory; extracted figures land in `figures_dir`
temp_dir = tempfile.mkdtemp()
figures_dir = os.path.join(temp_dir, "figures")
os.makedirs(figures_dir, exist_ok=True)


def extract_image_description(image_path):
    """Analyze one extracted image with the vision model.

    Args:
        image_path: Path to the extracted image file.

    Returns:
        A text description of the image content, or an error placeholder
        string if the vision call fails (processing must not abort on a
        single bad image).
    """
    try:
        with open(image_path, "rb") as img_file:
            image_bytes = img_file.read()
        # BUG FIX: the original called `text_to_image_generation`, which does
        # not exist on InferenceClient (and text-to-image runs in the wrong
        # direction anyway), so every image failed. `image_to_text` is the
        # captioning task that turns image bytes into a description.
        response = vision_client.image_to_text(image_bytes)
        # ImageToTextOutput exposes the caption as `generated_text`; fall back
        # to the raw response if the client returns a plain string.
        description = getattr(response, "generated_text", response)
        return f"Image content: {description}"
    except Exception as e:
        # Best effort: report the failure inline instead of raising.
        return f"Image content: [Could not analyze image - {str(e)}]"


def process_pdf_multimodal(pdf_file):
    """Extract text + images from a PDF and build the retrieval index.

    1. Extracts text and images from the PDF using unstructured.
    2. Analyzes extracted images with the vision model.
    3. Combines text and image descriptions into one corpus.
    4. Creates a FAISS index for retrieval.

    Args:
        pdf_file: Uploaded PDF file (Gradio file object with a `.name` path).

    Returns:
        Tuple of (PDF filename, status message, gr.update for the question
        box's interactivity).
    """
    global current_pdf_name, index, retriever, extracted_content

    if pdf_file is None:
        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

    current_pdf_name = os.path.basename(pdf_file.name)

    try:
        # Clear figures left over from a previously processed PDF
        for file in os.listdir(figures_dir):
            os.remove(os.path.join(figures_dir, file))

        # Extract elements from the PDF, saving images/tables to figures_dir.
        # HI_RES strategy is required for image block extraction.
        elements = partition_pdf(
            pdf_file.name,
            strategy=PartitionStrategy.HI_RES,
            extract_image_block_types=["Image", "Table"],
            extract_image_block_output_dir=figures_dir,
            extract_image_block_to_payload=False,
        )

        # Keep only textual elements; images/tables are handled via the
        # files written to figures_dir below.
        text_elements = [
            element.text
            for element in elements
            if element.category not in ["Image", "Table"]
        ]

        # Caption every extracted figure so visual content is searchable
        image_descriptions = []
        if os.path.exists(figures_dir):
            for image_file in os.listdir(figures_dir):
                if image_file.lower().endswith((".png", ".jpg", ".jpeg")):
                    image_path = os.path.join(figures_dir, image_file)
                    image_descriptions.append(extract_image_description(image_path))

        # Combine text and image descriptions into a single corpus
        all_content = text_elements + image_descriptions
        extracted_content = "\n\n".join(all_content)

        # Split into overlapping chunks sized for the embedding model
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            add_start_index=True,
        )
        chunks = text_splitter.split_text(extracted_content)

        # Build the FAISS index and a top-3 retriever over it
        index = FAISS.from_texts(chunks, embeddings)
        retriever = index.as_retriever(search_kwargs={"k": 3})

        num_images = len(image_descriptions)
        status = (
            f"✅ Processed '{current_pdf_name}' — "
            f"{len(chunks)} text chunks, {num_images} images analyzed"
        )
        return current_pdf_name, status, gr.update(interactive=True)

    except Exception as e:
        error_msg = f"❌ Error processing PDF: {str(e)}"
        return current_pdf_name, error_msg, gr.update(interactive=False)


def ask_multimodal_question(pdf_name, question):
    """Answer a question using both text and image content from the PDF.

    Args:
        pdf_name: Display name of the loaded PDF (unused; kept for the
            Gradio binding signature).
        question: User's question.

    Returns:
        Generated answer string, or an error message.
    """
    global retriever

    if index is None or retriever is None:
        return "❌ Please upload and process a PDF first."
    if not question.strip():
        return "❌ Please enter a question."

    try:
        # Retrieve the most relevant chunks (text + image descriptions)
        docs = retriever.get_relevant_documents(question)
        context = "\n\n".join(doc.page_content for doc in docs)

        # Prompt emphasizes that context mixes text excerpts and image captions
        prompt = (
            "You are an AI assistant analyzing a document that contains both text and images. "
            "Use the following content (which includes text excerpts and descriptions of images/charts/tables) "
            "to answer the question comprehensively.\n\n"
            f"Document Content:\n{context}\n\n"
            f"Question: {question}\n\n"
            "Provide a detailed answer based on both the textual information and visual elements described above. "
            "If the answer involves data from charts, tables, or images, mention that explicitly.\n"
            "Answer:"
        )

        response = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=256,
            temperature=0.5,
        )
        # Documented access path on ChatCompletionOutput
        return response.choices[0].message.content.strip()

    except Exception as e:
        return f"❌ Error generating answer: {str(e)}"


def generate_multimodal_summary():
    """Generate a summary considering both text and visual elements.

    Returns:
        Summary string, or an error message if no PDF has been processed.
    """
    if not extracted_content:
        return "❌ Please upload and process a PDF first."

    try:
        # Cap the prompt size; the first 3000 characters are representative
        content_preview = extracted_content[:3000]

        prompt = (
            "Provide a comprehensive summary of this document that contains both text and visual elements "
            "(images, charts, tables). Mention key textual information as well as important visual content.\n\n"
            f"{content_preview}..."
        )

        response = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=200,
            temperature=0.5,
        )
        return response.choices[0].message.content.strip()

    except Exception as e:
        return f"❌ Error generating summary: {str(e)}"


def extract_multimodal_keywords():
    """Extract key terms from both text and visual content.

    Returns:
        Keyword list as a string, or an error message if no PDF is loaded.
    """
    if not extracted_content:
        return "❌ Please upload and process a PDF first."

    try:
        content_preview = extracted_content[:3000]

        prompt = (
            "Extract 10-15 key terms and concepts from this document that contains both text and visual elements. "
            "Include important terms from both textual content and visual elements like charts, images, and tables.\n\n"
            f"{content_preview}..."
        )

        response = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=100,
            temperature=0.5,
        )
        return response.choices[0].message.content.strip()

    except Exception as e:
        return f"❌ Error extracting keywords: {str(e)}"


def clear_multimodal_interface():
    """Reset all global state and clear the UI.

    Returns:
        Values for (pdf_file, pdf_display, question_input) components.
    """
    global index, retriever, current_pdf_name, extracted_content

    # Remove extracted figures; a failure here (e.g. directory already gone)
    # must not prevent the UI reset. Narrow except — the original bare
    # `except:` also swallowed KeyboardInterrupt/SystemExit.
    try:
        for file in os.listdir(figures_dir):
            os.remove(os.path.join(figures_dir, file))
    except OSError:
        pass

    # Reset globals
    index = retriever = None
    current_pdf_name = extracted_content = None

    return None, "", gr.update(interactive=False)


# ── Gradio UI ─────────────────────────────────────────────────────────────────
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")

with gr.Blocks(theme=theme, css="""
    .container { border-radius: 10px; padding: 15px; }
    .pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
    .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
    .main-title { text-align: center; font-size: 64px; font-weight: bold; margin-bottom: 20px; }
    .multimodal-badge {
        background: linear-gradient(45deg, #6366f1, #8b5cf6);
        color: white; padding: 5px 15px; border-radius: 20px;
        font-size: 14px; display: inline-block; margin: 10px auto;
    }
""") as demo:
    # Application title with multimodal badge (classes defined in css above)
    gr.Markdown("<div class='main-title'>MultiModal DocQueryAI</div>")
    gr.Markdown(
        "<div style='text-align: center;'>"
        "<span class='multimodal-badge'>🖼️ Text + Images + Charts</span>"
        "</div>"
    )

    with gr.Row():
        with gr.Column():
            gr.Markdown("## 📄 Document Input")
            pdf_display = gr.Textbox(
                label="Active Document", interactive=False, elem_classes="pdf-active"
            )
            pdf_file = gr.File(
                file_types=[".pdf"], type="filepath",
                label="Upload PDF (with images/charts)"
            )
            upload_button = gr.Button(
                "🔄 Process Document (Extract Text + Images)", variant="primary"
            )
            status_box = gr.Textbox(label="Processing Status", interactive=False)

        with gr.Column():
            gr.Markdown("## ❓ Ask Questions")
            gr.Markdown(
                "*Ask about text content, images, charts, tables, "
                "or any visual elements in your PDF*"
            )
            question_input = gr.Textbox(
                lines=3,
                placeholder="Ask about text, images, charts, or any content in the PDF...",
                interactive=False,
            )
            ask_button = gr.Button("🔍 Ask Question", variant="primary")
            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)

    # Analysis tools
    with gr.Row():
        with gr.Column():
            summary_button = gr.Button("📋 Generate Summary", variant="secondary")
            summary_output = gr.Textbox(
                label="Document Summary", lines=4, interactive=False
            )
        with gr.Column():
            keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
            keywords_output = gr.Textbox(label="Key Terms", lines=4, interactive=False)

    # Clear button
    clear_button = gr.Button("🗑️ Clear All", variant="secondary")

    gr.Markdown("""
    """)

    # Event bindings
    upload_button.click(
        process_pdf_multimodal,
        [pdf_file],
        [pdf_display, status_box, question_input],
    )
    ask_button.click(
        ask_multimodal_question,
        [pdf_display, question_input],
        answer_output,
    )
    summary_button.click(generate_multimodal_summary, [], summary_output)
    keywords_button.click(extract_multimodal_keywords, [], keywords_output)
    clear_button.click(
        clear_multimodal_interface,
        [],
        [pdf_file, pdf_display, question_input],
    )

if __name__ == "__main__":
    demo.launch(debug=True, share=True)