import os
import gradio as gr
import base64
from PIL import Image
import io
import requests
# Import vectorstore and embeddings from langchain community package
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
# Text splitter to break large documents into manageable chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
# HF Inference client for running chat completions
from huggingface_hub import InferenceClient
# Unstructured for advanced PDF processing with image/table extraction
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.utils.constants import PartitionStrategy
# ── Globals ───────────────────────────────────────────────────────────────────
index = None # FAISS index storing document embeddings
retriever = None # Retriever to fetch relevant chunks
current_pdf_name = None # Name of the currently loaded PDF
pdf_text = None # Full text of the uploaded PDF
extracted_images = [] # List to store extracted images and their descriptions
# Create directories for storing extracted figures
FIGURES_DIR = "extracted_figures/"
os.makedirs(FIGURES_DIR, exist_ok=True)
# ── HF Inference clients for different models ─────────────────────────────────
# Text generation model
text_client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3")
# Vision-Language Models (choose one based on your needs and HF availability)
# Option 1: BLIP-2 for general image understanding
vision_client = InferenceClient(model="Salesforce/blip2-opt-2.7b")
# Option 2: Alternative vision models you can use:
# vision_client = InferenceClient(model="microsoft/git-base-coco")
# vision_client = InferenceClient(model="nlpconnect/vit-gpt2-image-captioning")
# vision_client = InferenceClient(model="Salesforce/blip-image-captioning-large")
# For more advanced multimodal tasks, you can use:
# multimodal_client = InferenceClient(model="microsoft/DialoGPT-medium") # For conversational AI
# multimodal_client = InferenceClient(model="facebook/opt-iml-max-30b") # For instruction following
# ── Multimodal Embeddings ───────────────────────────────────────────────────
# Primary: CLIP embeddings for excellent text-image alignment
try:
embeddings = HuggingFaceEmbeddings(
model_name="sentence-transformers/clip-ViT-B-32",
model_kwargs={'device': 'cpu'}, # Ensure CPU usage for HF Spaces
encode_kwargs={'normalize_embeddings': True}
)
print("βœ… Using CLIP embeddings for multimodal support")
except Exception as e:
print(f"⚠️ CLIP failed, falling back to BGE: {e}")
# Fallback to BGE embeddings
embeddings = HuggingFaceEmbeddings(
model_name="BAAI/bge-base-en-v1.5",
model_kwargs={'device': 'cpu'},
encode_kwargs={'normalize_embeddings': True}
    )

# Text splitter used to break extracted PDF text into chunks before embedding.
# (The chunk sizes below are reasonable defaults; 1000 mirrors the max_characters
# value passed to partition_pdf further down.)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
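# A quick sanity check (illustrative only): embed_query is the standard LangChain
# embeddings call, and a "Visual content:"-tagged string embeds like any other text,
# which is what lets plain-text questions match image descriptions in the FAISS index.
#   vec = embeddings.embed_query("Visual content: bar chart of quarterly revenue")
#   print(len(vec))  # 512 dims for clip-ViT-B-32, 768 for bge-base-en-v1.5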
def create_multimodal_embeddings(text_chunks, image_descriptions):
"""
Create embeddings that combine text and visual information
"""
try:
all_chunks = []
# Process text chunks
for chunk in text_chunks:
# Add context markers for better embedding
enhanced_chunk = f"Document text: {chunk}"
all_chunks.append(enhanced_chunk)
# Process image descriptions with special formatting
for img_desc in image_descriptions:
# Mark visual content for better embedding alignment
enhanced_desc = f"Visual content: {img_desc}"
all_chunks.append(enhanced_desc)
return all_chunks
except Exception as e:
print(f"Error creating multimodal embeddings: {e}")
return text_chunks + image_descriptions
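# Example of what the helper above produces (illustrative values only):
#   create_multimodal_embeddings(
#       ["Revenue grew 12% in Q3."],
#       ["Visual Element Analysis: bar chart of quarterly revenue"],
#   )
#   -> ["Document text: Revenue grew 12% in Q3.",
#       "Visual content: Visual Element Analysis: bar chart of quarterly revenue"]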
"""
Enhanced image description using multiple vision models
"""
try:
# Load and process image
with open(image_path, "rb") as f:
image_bytes = f.read()
# Method 1: Use BLIP-2 for detailed image captioning
try:
description = vision_client.image_to_text(image_bytes)
base_description = description if isinstance(description, str) else description.get('generated_text', '')
except Exception as e:
print(f"BLIP-2 failed: {e}")
base_description = "Image could not be processed with vision model"
# Method 2: Enhance with text-based analysis using the text model
enhancement_prompt = f"""
Analyze this image description and provide a detailed analysis focusing on:
1. Any text, numbers, or data visible
2. Charts, graphs, or tables
3. Key visual elements and their significance
4. Context and meaning
Description: {base_description}
Provide a comprehensive analysis:
"""
try:
response = text_client.chat_completion(
messages=[{"role": "user", "content": enhancement_prompt}],
max_tokens=300,
temperature=0.3
)
enhanced_description = response["choices"][0]["message"]["content"].strip()
except Exception as e:
print(f"Text enhancement failed: {e}")
enhanced_description = base_description
return f"Visual Element Analysis:\n{enhanced_description}"
except Exception as e:
print(f"Error processing image {image_path}: {str(e)}")
return f"Visual element detected: {os.path.basename(image_path)} (processing failed)"
def process_pdf_multimodal_advanced(pdf_file):
"""
Advanced multimodal PDF processing with enhanced vision capabilities
"""
global current_pdf_name, index, retriever, pdf_text, extracted_images
if pdf_file is None:
return None, "❌ Please upload a PDF file.", gr.update(interactive=False)
current_pdf_name = os.path.basename(pdf_file.name)
extracted_images = []
# Clear existing figures directory
for file in os.listdir(FIGURES_DIR):
try:
os.remove(os.path.join(FIGURES_DIR, file))
        except OSError:
            pass
try:
# Process PDF with unstructured
elements = partition_pdf(
pdf_file.name,
strategy=PartitionStrategy.HI_RES,
extract_image_block_types=["Image", "Table"],
extract_image_block_output_dir=FIGURES_DIR,
extract_image_block_to_payload=False,
# Additional parameters for better extraction
infer_table_structure=True,
chunking_strategy="by_title",
max_characters=1000,
combine_text_under_n_chars=100
)
# Process elements
text_elements = []
visual_descriptions = []
for element in elements:
if element.category in ["Image", "Table"]:
# Handle image/table elements
continue
elif element.category == "Title":
text_elements.append(f"TITLE: {element.text}")
elif element.category == "Header":
text_elements.append(f"HEADER: {element.text}")
else:
if hasattr(element, 'text') and element.text.strip():
text_elements.append(element.text)
pdf_text = "\n\n".join(text_elements)
# Process extracted visual elements
if os.path.exists(FIGURES_DIR):
for filename in sorted(os.listdir(FIGURES_DIR)):
if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
image_path = os.path.join(FIGURES_DIR, filename)
# Get enhanced description
description = extract_image_description_advanced(image_path)
visual_descriptions.append(description)
extracted_images.append({
'path': image_path,
'description': description,
'filename': filename,
'type': 'table' if 'table' in filename.lower() else 'image'
})
        # Split the extracted text into chunks, then combine them with the visual descriptions
text_chunks = text_splitter.split_text(pdf_text) if pdf_text else []
# Create multimodal embeddings
all_chunks = create_multimodal_embeddings(text_chunks, visual_descriptions)
# Create FAISS index with optimized settings for multimodal content
if all_chunks:
index = FAISS.from_texts(all_chunks, embeddings)
retriever = index.as_retriever(
search_type="mmr", # Maximum marginal relevance for diverse results
search_kwargs={
"k": 5, # Get more results for multimodal content
"fetch_k": 10, # Broader initial search
"lambda_mult": 0.6 # Balance between relevance and diversity
}
)
else:
raise Exception("No content extracted from PDF")
status = f"βœ… Advanced processing complete for '{current_pdf_name}'\nπŸ“„ {len(text_elements)} text sections\nπŸ–ΌοΈ {len(extracted_images)} visual elements\nπŸ“¦ {len(all_chunks)} total searchable chunks"
return current_pdf_name, status, gr.update(interactive=True)
except Exception as e:
error_msg = f"❌ Processing error: {str(e)}"
return current_pdf_name, error_msg, gr.update(interactive=False)
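# Retrieval sketch (illustrative; mirrors the MMR retriever configured above):
#   docs = retriever.get_relevant_documents("What does the revenue chart show?")
#   MMR re-ranks the fetch_k=10 candidates so the k=5 returned chunks trade off
#   relevance against diversity (lambda_mult=0.6), mixing text and visual chunks.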
def ask_question_multimodal_advanced(pdf_name, question):
"""
Advanced multimodal question answering with smart routing
"""
global retriever, extracted_images
if index is None or retriever is None:
return "❌ Please upload and process a PDF first."
if not question.strip():
return "❌ Please enter a question."
try:
# Retrieve relevant chunks
docs = retriever.get_relevant_documents(question)
context = "\n\n".join([doc.page_content for doc in docs])
# Enhanced visual query detection
visual_keywords = [
'image', 'figure', 'chart', 'graph', 'table', 'diagram', 'picture',
'visual', 'show', 'display', 'plot', 'data', 'visualization',
'illustration', 'screenshot', 'photo', 'drawing'
]
is_visual_query = any(keyword in question.lower() for keyword in visual_keywords)
# Smart context enhancement
if is_visual_query and extracted_images:
# Prioritize visual content for visual queries
visual_context = "\n\n".join([img['description'] for img in extracted_images])
enhanced_context = f"{visual_context}\n\nAdditional Context:\n{context}"
else:
enhanced_context = context
# Advanced prompting based on query type
if is_visual_query:
system_prompt = """You are an expert document analyst specializing in multimodal content analysis.
You excel at interpreting charts, graphs, tables, images, and visual data alongside textual information.
When answering questions about visual elements, be specific about what you observe and provide detailed insights."""
else:
system_prompt = """You are an expert document analyst. Provide accurate, comprehensive answers based on the document content.
Use the context provided to give detailed and helpful responses."""
prompt = f"""{system_prompt}
Context: {enhanced_context}
Question: {question}
Provide a detailed, accurate answer based on the context above. If the question relates to visual elements, describe what you can understand from the visual descriptions provided."""
response = text_client.chat_completion(
messages=[{"role": "user", "content": prompt}],
max_tokens=400,
temperature=0.4
)
answer = response["choices"][0]["message"]["content"].strip()
return answer
except Exception as e:
return f"❌ Error generating answer: {str(e)}"
def analyze_document_structure():
"""
New feature: Analyze the overall structure of the document
"""
global pdf_text, extracted_images
if not pdf_text and not extracted_images:
return "❌ Please upload and process a PDF first."
try:
structure_prompt = f"""
Analyze the structure and organization of this document. Provide insights about:
1. Document type and purpose
2. Main sections and topics
3. Visual elements present ({len(extracted_images)} images/tables/charts)
4. Key information hierarchy
5. Overall document quality and completeness
Text content sample: {pdf_text[:1000]}
Visual elements: {len(extracted_images)} items detected
Provide a structural analysis:
"""
response = text_client.chat_completion(
messages=[{"role": "user", "content": structure_prompt}],
max_tokens=300,
temperature=0.3
)
return response["choices"][0]["message"]["content"].strip()
except Exception as e:
return f"❌ Error analyzing structure: {str(e)}"
# ── Summary, keyword, and display helpers ─────────────────────────────────────
def generate_summary_multimodal():
"""Enhanced summary generation considering both text and visual content"""
global pdf_text, extracted_images
if not pdf_text and not extracted_images:
return "❌ Please upload and process a PDF first."
try:
content_parts = []
if pdf_text:
content_parts.append(f"Text Content:\n{pdf_text[:2000]}")
if extracted_images:
visual_summary = "\n".join([img['description'][:200] for img in extracted_images[:3]])
content_parts.append(f"Visual Content:\n{visual_summary}")
combined_content = "\n\n".join(content_parts)
prompt = f"""Provide a comprehensive summary of this document that includes both textual and visual elements.
Focus on key findings, main topics, and insights from charts, tables, or images.
Content: {combined_content}
Summary:"""
response = text_client.chat_completion(
messages=[{"role": "user", "content": prompt}],
max_tokens=250,
temperature=0.5
)
return response["choices"][0]["message"]["content"].strip()
except Exception as e:
return f"❌ Error generating summary: {str(e)}"
def extract_keywords_multimodal():
"""Enhanced keyword extraction from both text and visual content"""
global pdf_text, extracted_images
if not pdf_text and not extracted_images:
return "❌ Please upload and process a PDF first."
try:
content_parts = []
if pdf_text:
content_parts.append(f"Text: {pdf_text[:1500]}")
if extracted_images:
visual_content = "\n".join([img['description'][:150] for img in extracted_images])
content_parts.append(f"Visual Content: {visual_content}")
combined_content = "\n\n".join(content_parts)
prompt = f"""Extract key terms, concepts, and topics from this document content.
Include technical terms, important concepts, and themes from both text and visual elements.
Content: {combined_content}
Key terms and concepts:"""
response = text_client.chat_completion(
messages=[{"role": "user", "content": prompt}],
max_tokens=120,
temperature=0.5
)
return response["choices"][0]["message"]["content"].strip()
except Exception as e:
return f"❌ Error extracting keywords: {str(e)}"
def show_extracted_images():
"""Display information about extracted images"""
global extracted_images
if not extracted_images:
return "No visual elements extracted from the current document."
info = f"πŸ“Š Extracted {len(extracted_images)} visual elements:\n\n"
for i, img in enumerate(extracted_images, 1):
element_type = "πŸ“Š Table" if img['type'] == 'table' else "πŸ–ΌοΈ Image"
info += f"{i}. {element_type}: {img['filename']}\n"
info += f" Description: {img['description'][:150]}...\n\n"
if i >= 5: # Limit display to first 5
remaining = len(extracted_images) - 5
if remaining > 0:
info += f"... and {remaining} more visual elements."
break
return info
def clear_interface_multimodal():
"""Enhanced clear function for multimodal system"""
global index, retriever, current_pdf_name, pdf_text, extracted_images
index = retriever = None
current_pdf_name = pdf_text = None
extracted_images = []
if os.path.exists(FIGURES_DIR):
for file in os.listdir(FIGURES_DIR):
try:
os.remove(os.path.join(FIGURES_DIR, file))
            except OSError:
                pass
return None, "", gr.update(interactive=False), "", "", "", "", ""
# Enhanced Gradio UI
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue")
with gr.Blocks(theme=theme, css="""
.container { border-radius: 10px; padding: 15px; }
.pdf-active { border-left: 3px solid #6366f1; padding-left: 10px; background-color: rgba(99,102,241,0.1); }
.footer { text-align: center; margin-top: 30px; font-size: 0.8em; color: #666; }
.main-title {
text-align: center;
font-size: 56px;
font-weight: bold;
margin-bottom: 20px;
background: linear-gradient(45deg, #6366f1, #8b5cf6, #ec4899);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
}
.feature-badge {
background: linear-gradient(45deg, #10b981, #3b82f6);
color: white;
padding: 4px 12px;
border-radius: 15px;
font-size: 11px;
margin: 2px;
display: inline-block;
}
""") as demo:
gr.Markdown("<div class='main-title'>πŸ€– DocQueryAI Pro</div>")
gr.Markdown("""
<div style='text-align: center; margin-bottom: 25px;'>
        <span class='feature-badge'>🔍 Advanced RAG</span>
        <span class='feature-badge'>🖼️ Vision AI</span>
        <span class='feature-badge'>📊 Table Analysis</span>
        <span class='feature-badge'>📈 Chart Understanding</span>
        <span class='feature-badge'>🧠 Smart Retrieval</span>
</div>
""")
with gr.Row():
with gr.Column():
gr.Markdown("## πŸ“„ Document Processing")
pdf_display = gr.Textbox(label="Active Document", interactive=False, elem_classes="pdf-active")
pdf_file = gr.File(file_types=[".pdf"], type="filepath", label="Upload PDF Document")
upload_button = gr.Button("πŸš€ Process with AI Vision", variant="primary", size="lg")
status_box = gr.Textbox(label="Processing Status", interactive=False, lines=3)
with gr.Column():
gr.Markdown("## πŸ’¬ Intelligent Q&A")
gr.Markdown("*Ask about any content: text, images, charts, tables, or data visualizations*")
question_input = gr.Textbox(
lines=3,
placeholder="Examples:\nβ€’ What does the chart show?\nβ€’ Summarize the table data\nβ€’ Explain the main findings",
label="Your Question"
)
ask_button = gr.Button("πŸ” Get AI Answer", variant="primary", size="lg")
answer_output = gr.Textbox(label="AI Response", lines=8, interactive=False)
with gr.Row():
with gr.Column():
summary_button = gr.Button("πŸ“‹ Generate Summary", variant="secondary")
summary_output = gr.Textbox(label="Document Summary", lines=5, interactive=False)
with gr.Column():
keywords_button = gr.Button("🏷️ Extract Keywords", variant="secondary")
keywords_output = gr.Textbox(label="Key Concepts", lines=5, interactive=False)
with gr.Row():
with gr.Column():
structure_button = gr.Button("πŸ—οΈ Analyze Structure", variant="secondary")
structure_output = gr.Textbox(label="Document Structure Analysis", lines=5, interactive=False)
with gr.Column():
images_button = gr.Button("πŸ–ΌοΈ Show Visual Elements", variant="secondary")
images_output = gr.Textbox(label="Extracted Visual Elements", lines=5, interactive=False)
with gr.Row():
clear_button = gr.Button("πŸ—‘οΈ Clear All", variant="secondary", size="sm")
gr.Markdown("""
<div class='footer'>
        🚀 <strong>Powered by Advanced AI</strong><br>
        🔧 HuggingFace Transformers • LangChain • FAISS • Unstructured<br>
        🎯 Multimodal RAG: Text + Vision + Tables + Charts
</div>
""")
# Event bindings
upload_button.click(process_pdf_multimodal_advanced, [pdf_file], [pdf_display, status_box, question_input])
ask_button.click(ask_question_multimodal_advanced, [pdf_display, question_input], answer_output)
summary_button.click(generate_summary_multimodal, [], summary_output)
keywords_button.click(extract_keywords_multimodal, [], keywords_output)
structure_button.click(analyze_document_structure, [], structure_output)
images_button.click(show_extracted_images, [], images_output)
clear_button.click(clear_interface_multimodal, [], [pdf_file, pdf_display, question_input, answer_output, summary_output, keywords_output, structure_output, images_output])
if __name__ == "__main__":
demo.launch(debug=True, share=True)