import os
import tempfile
from pathlib import Path

import gradio as gr

# Vector store and embeddings from the LangChain community package
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
# Text splitter to break large documents into manageable chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
# HF Inference client for running hosted text / vision models
from huggingface_hub import InferenceClient
# Unstructured for PDF processing with image extraction
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.utils.constants import PartitionStrategy

# ── Globals ───────────────────────────────────────────────────────────────────
index = None              # FAISS index storing document embeddings
retriever = None          # Retriever to fetch relevant chunks
current_pdf_name = None   # Name of the currently loaded PDF
extracted_content = None  # Combined text and image descriptions

# ── HF Inference clients ──────────────────────────────────────────────────────
# Text generation client (instruction-tuned open model)
text_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
# Vision client for image analysis / captioning
vision_client = InferenceClient(model="llava-hf/llava-1.5-7b-hf")

# ── Embeddings ────────────────────────────────────────────────────────────────
# BGE embeddings for vectorizing text chunks
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Temporary working directory; extracted figures land in `figures_dir`
temp_dir = tempfile.mkdtemp()
figures_dir = os.path.join(temp_dir, "figures")
os.makedirs(figures_dir, exist_ok=True)


def extract_image_description(image_path):
    """Analyze one extracted image with the vision model.

    Args:
        image_path: Path to the extracted image file.

    Returns:
        A text description of the image content, or an error placeholder
        string if the vision call fails (processing must not abort on a
        single bad image).
    """
    try:
        with open(image_path, "rb") as img_file:
            image_bytes = img_file.read()
        # BUG FIX: the original called `text_to_image_generation`, which does
        # not exist on InferenceClient (and text-to-image runs in the wrong
        # direction anyway), so every image failed. `image_to_text` is the
        # captioning task that turns image bytes into a description.
        response = vision_client.image_to_text(image_bytes)
        # ImageToTextOutput exposes the caption as `generated_text`; fall back
        # to the raw response if the client returns a plain string.
        description = getattr(response, "generated_text", response)
        return f"Image content: {description}"
    except Exception as e:
        # Best effort: report the failure inline instead of raising.
        return f"Image content: [Could not analyze image - {str(e)}]"


def process_pdf_multimodal(pdf_file):
    """Extract text + images from a PDF and build the retrieval index.

    1. Extracts text and images from the PDF using unstructured.
    2. Analyzes extracted images with the vision model.
    3. Combines text and image descriptions into one corpus.
    4. Creates a FAISS index for retrieval.

    Args:
        pdf_file: Uploaded PDF file (Gradio file object with a `.name` path).

    Returns:
        Tuple of (PDF filename, status message, gr.update for the question
        box's interactivity).
    """
    global current_pdf_name, index, retriever, extracted_content

    if pdf_file is None:
        return None, "❌ Please upload a PDF file.", gr.update(interactive=False)

    current_pdf_name = os.path.basename(pdf_file.name)

    try:
        # Clear figures left over from a previously processed PDF
        for file in os.listdir(figures_dir):
            os.remove(os.path.join(figures_dir, file))

        # Extract elements from the PDF, saving images/tables to figures_dir.
        # HI_RES strategy is required for image block extraction.
        elements = partition_pdf(
            pdf_file.name,
            strategy=PartitionStrategy.HI_RES,
            extract_image_block_types=["Image", "Table"],
            extract_image_block_output_dir=figures_dir,
            extract_image_block_to_payload=False,
        )

        # Keep only textual elements; images/tables are handled via the
        # files written to figures_dir below.
        text_elements = [
            element.text
            for element in elements
            if element.category not in ["Image", "Table"]
        ]

        # Caption every extracted figure so visual content is searchable
        image_descriptions = []
        if os.path.exists(figures_dir):
            for image_file in os.listdir(figures_dir):
                if image_file.lower().endswith((".png", ".jpg", ".jpeg")):
                    image_path = os.path.join(figures_dir, image_file)
                    image_descriptions.append(extract_image_description(image_path))

        # Combine text and image descriptions into a single corpus
        all_content = text_elements + image_descriptions
        extracted_content = "\n\n".join(all_content)

        # Split into overlapping chunks sized for the embedding model
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            add_start_index=True,
        )
        chunks = text_splitter.split_text(extracted_content)

        # Build the FAISS index and a top-3 retriever over it
        index = FAISS.from_texts(chunks, embeddings)
        retriever = index.as_retriever(search_kwargs={"k": 3})

        num_images = len(image_descriptions)
        status = (
            f"✅ Processed '{current_pdf_name}' — "
            f"{len(chunks)} text chunks, {num_images} images analyzed"
        )
        return current_pdf_name, status, gr.update(interactive=True)

    except Exception as e:
        error_msg = f"❌ Error processing PDF: {str(e)}"
        return current_pdf_name, error_msg, gr.update(interactive=False)


def ask_multimodal_question(pdf_name, question):
    """Answer a question using both text and image content from the PDF.

    Args:
        pdf_name: Display name of the loaded PDF (unused; kept for the
            Gradio binding signature).
        question: User's question.

    Returns:
        Generated answer string, or an error message.
    """
    global retriever

    if index is None or retriever is None:
        return "❌ Please upload and process a PDF first."
    if not question.strip():
        return "❌ Please enter a question."

    try:
        # Retrieve the most relevant chunks (text + image descriptions)
        docs = retriever.get_relevant_documents(question)
        context = "\n\n".join(doc.page_content for doc in docs)

        # Prompt emphasizes that context mixes text excerpts and image captions
        prompt = (
            "You are an AI assistant analyzing a document that contains both text and images. "
            "Use the following content (which includes text excerpts and descriptions of images/charts/tables) "
            "to answer the question comprehensively.\n\n"
            f"Document Content:\n{context}\n\n"
            f"Question: {question}\n\n"
            "Provide a detailed answer based on both the textual information and visual elements described above. "
            "If the answer involves data from charts, tables, or images, mention that explicitly.\n"
            "Answer:"
        )

        response = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=256,
            temperature=0.5,
        )
        # Documented access path on ChatCompletionOutput
        return response.choices[0].message.content.strip()

    except Exception as e:
        return f"❌ Error generating answer: {str(e)}"


def generate_multimodal_summary():
    """Generate a summary considering both text and visual elements.

    Returns:
        Summary string, or an error message if no PDF has been processed.
    """
    if not extracted_content:
        return "❌ Please upload and process a PDF first."

    try:
        # Cap the prompt size; the first 3000 characters are representative
        content_preview = extracted_content[:3000]

        prompt = (
            "Provide a comprehensive summary of this document that contains both text and visual elements "
            "(images, charts, tables). Mention key textual information as well as important visual content.\n\n"
            f"{content_preview}..."
        )

        response = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=200,
            temperature=0.5,
        )
        return response.choices[0].message.content.strip()

    except Exception as e:
        return f"❌ Error generating summary: {str(e)}"


def extract_multimodal_keywords():
    """Extract key terms from both text and visual content.

    Returns:
        Keyword list as a string, or an error message if no PDF is loaded.
    """
    if not extracted_content:
        return "❌ Please upload and process a PDF first."

    try:
        content_preview = extracted_content[:3000]

        prompt = (
            "Extract 10-15 key terms and concepts from this document that contains both text and visual elements. "
            "Include important terms from both textual content and visual elements like charts, images, and tables.\n\n"
            f"{content_preview}..."
        )

        response = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=100,
            temperature=0.5,
        )
        return response.choices[0].message.content.strip()

    except Exception as e:
        return f"❌ Error extracting keywords: {str(e)}"


def clear_multimodal_interface():
    """Reset all global state and clear the UI.

    Returns:
        Values for (pdf_file, pdf_display, question_input) components.
    """
    global index, retriever, current_pdf_name, extracted_content

    # Remove extracted figures; a failure here (e.g. directory already gone)
    # must not prevent the UI reset. Narrow except — the original bare
    # `except:` also swallowed KeyboardInterrupt/SystemExit.
    try:
        for file in os.listdir(figures_dir):
            os.remove(os.path.join(figures_dir, file))
    except OSError:
        pass

    # Reset globals
    index = retriever = None
    current_pdf_name = extracted_content = None

    return None, "", gr.update(interactive=False)


# ── Gradio UI ─────────────────────────────────────────────────────────────────
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")

with gr.Blocks(theme=theme, css="""
    .container { border-radius: 10px; padding: 15px; }
    .pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
    .footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
    .main-title { text-align: center; font-size: 64px; font-weight: bold; margin-bottom: 20px; }
    .multimodal-badge {
        background: linear-gradient(45deg, #6366f1, #8b5cf6);
        color: white; padding: 5px 15px; border-radius: 20px;
        font-size: 14px; display: inline-block; margin: 10px auto;
    }
""") as demo:
    # Application title with multimodal badge (classes defined in css above)
    gr.Markdown("<div class='main-title'>MultiModal DocQueryAI</div>")
    gr.Markdown(
        "<div style='text-align: center;'>"
        "<span class='multimodal-badge'>🖼️ Text + Images + Charts</span>"
        "</div>"
    )

    with gr.Row():
        with gr.Column():
            gr.Markdown("## 📄 Document Input")
            pdf_display = gr.Textbox(
                label="Active Document", interactive=False, elem_classes="pdf-active"
            )
            pdf_file = gr.File(
                file_types=[".pdf"], type="filepath",
                label="Upload PDF (with images/charts)"
            )
            upload_button = gr.Button(
                "🔄 Process Document (Extract Text + Images)", variant="primary"
            )
            status_box = gr.Textbox(label="Processing Status", interactive=False)

        with gr.Column():
            gr.Markdown("## ❓ Ask Questions")
            gr.Markdown(
                "*Ask about text content, images, charts, tables, "
                "or any visual elements in your PDF*"
            )
            question_input = gr.Textbox(
                lines=3,
                placeholder="Ask about text, images, charts, or any content in the PDF...",
                interactive=False,
            )
            ask_button = gr.Button("🔍 Ask Question", variant="primary")
            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)

    # Analysis tools
    with gr.Row():
        with gr.Column():
            summary_button = gr.Button("📋 Generate Summary", variant="secondary")
            summary_output = gr.Textbox(
                label="Document Summary", lines=4, interactive=False
            )
        with gr.Column():
            keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
            keywords_output = gr.Textbox(label="Key Terms", lines=4, interactive=False)

    # Clear button
    clear_button = gr.Button("🗑️ Clear All", variant="secondary")

    gr.Markdown("""
    """)

    # Event bindings
    upload_button.click(
        process_pdf_multimodal,
        [pdf_file],
        [pdf_display, status_box, question_input],
    )
    ask_button.click(
        ask_multimodal_question,
        [pdf_display, question_input],
        answer_output,
    )
    summary_button.click(generate_multimodal_summary, [], summary_output)
    keywords_button.click(extract_multimodal_keywords, [], keywords_output)
    clear_button.click(
        clear_multimodal_interface,
        [],
        [pdf_file, pdf_display, question_input],
    )

if __name__ == "__main__":
    demo.launch(debug=True, share=True)