import gradio as gr
from openai import OpenAI
import base64
from PIL import Image
import io
import fitz  # PyMuPDF
import tempfile
import os

# --- HELPER FUNCTIONS ---

def convert_pdf_to_images(pdf_file):
    """Convert PDF to list of PIL Images"""
    images = []
    try:
        # Save uploaded file to a temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(pdf_file)
            tmp_file_path = tmp_file.name

        # Open the PDF file
        pdf_document = fitz.open(tmp_file_path)

        # Iterate through each page
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            images.append(img)

        # Clean up
        pdf_document.close()
        os.unlink(tmp_file_path)
    except Exception as e:
        raise gr.Error(f"Error converting PDF: {e}")
    return images

def image_to_base64(image):
    """Convert PIL Image to base64 string"""
    with io.BytesIO() as buffer:
        image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")

def generate_summary(extracted_texts, api_key):
    """Generate a comprehensive summary of all extracted texts"""
    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key
        )
        summary_prompt = f"""
You are an expert document analyst. Below are the extracted contents from multiple pages of a document.
Please provide a comprehensive, detailed summary that:
1. Organizes all key information logically
2. Identifies relationships between data points
3. Highlights important figures, dates, names
4. Presents the information in a clear, structured format

Extracted contents from pages:
{extracted_texts}

Comprehensive Summary:
"""
        response = client.chat.completions.create(
            model="opengvlab/internvl3-14b:free",
            messages=[
                {"role": "system", "content": "You are Dalton, an expert in analyzing and summarizing document contents."},
                {"role": "user", "content": summary_prompt}
            ],
            max_tokens=2048
        )
        return response.choices[0].message.content
    except Exception as e:
        raise gr.Error(f"Error generating summary: {e}")

def analyze_document(api_key, user_prompt, uploaded_file):
    """Main processing function"""
    if not api_key:
        raise gr.Error("Please enter your OpenRouter API key")
    if uploaded_file is None:
        raise gr.Error("Please upload a document")

    images_to_analyze = []
    file_ext = os.path.splitext(uploaded_file.name)[1].lower()

    # Handle PDF or image
    if file_ext == '.pdf':
        with open(uploaded_file.name, "rb") as f:
            pdf_data = f.read()
        pdf_images = convert_pdf_to_images(pdf_data)
        images_to_analyze = pdf_images  # For simplicity, using all pages
    else:
        image = Image.open(uploaded_file.name)
        images_to_analyze = [image]

    # Process each image
    all_results = []
    extracted_texts = []
    for idx, image in enumerate(images_to_analyze, 1):
        try:
            client = OpenAI(
                base_url="https://openrouter.ai/api/v1",
                api_key=api_key
            )
            image_base64 = image_to_base64(image)
            response = client.chat.completions.create(
                model="opengvlab/internvl3-14b:free",
                messages=[
                    {"role": "system", "content": "You are Dalton, an expert in image understanding who can analyze images and provide detailed descriptions."},
                    {"role": "user", "content": [
                        {"type": "text", "text": user_prompt},
                        {"type": "image_url", "image_url": {
                            "url": f"data:image/png;base64,{image_base64}"
                        }}
                    ]}
                ],
                max_tokens=1024
            )
            result = response.choices[0].message.content
            extracted_texts.append(f"### Page {idx}\n{result}\n")
            all_results.append(f"## 📄 Page {idx} Results\n{result}\n---\n")
        except Exception as e:
            raise gr.Error(f"Error analyzing page {idx}: {e}")
    # Generate summary if multiple pages
    markdown_output = "\n".join(all_results)
    if len(extracted_texts) > 1:
        summary = generate_summary("\n".join(extracted_texts), api_key)
        markdown_output += f"\n## 📝 Comprehensive Summary\n{summary}\n"

    # Add structured data section
    markdown_output += "\n## 🔍 Key Data Extracted\n"
    markdown_output += "- **Important Figures**: [Extracted values]\n"
    markdown_output += "- **Critical Dates**: [Extracted dates]\n"
    markdown_output += "- **Main Entities**: [Identified names/companies]\n"
    markdown_output += "- **Action Items**: [Key tasks identified]\n"

    # Add document metadata
    markdown_output += f"\n---\n*Document processed: {uploaded_file.name}*"

    return markdown_output

# Custom CSS for dark theme with green text
custom_css = """
:root {
    --primary: #00ff00;
    --primary-50: #00ff0033;
    --primary-100: #00ff0066;
    --primary-200: #00ff0099;
    --primary-300: #00ff00cc;
    --secondary: #00cc00;
    --secondary-50: #00cc0033;
    --secondary-100: #00cc0066;
    --secondary-200: #00cc0099;
    --secondary-300: #00cc00cc;
    --color-background-primary: #000000;
    --color-background-secondary: #111111;
    --color-background-tertiary: #222222;
    --text-color: #00ff00;
    --block-background-fill: #111111;
    --block-border-color: #00aa00;
    --block-label-text-color: #00ff00;
    --block-title-text-color: #00ff00;
    --input-background-fill: #111111;
    --input-border-color: #00aa00;
    --input-text-color: #00ff00;
}

body {
    background-color: var(--color-background-primary) !important;
    color: var(--text-color) !important;
}

.markdown-output {
    padding: 20px;
    border-radius: 8px;
    background: var(--color-background-secondary);
    border: 1px solid var(--block-border-color);
    max-height: 600px;
    overflow-y: auto;
    color: var(--text-color) !important;
}

.markdown-output h1,
.markdown-output h2,
.markdown-output h3 {
    color: var(--primary) !important;
    border-bottom: 1px solid var(--primary-300);
}

.markdown-output a {
    color: var(--secondary) !important;
}

.markdown-output code {
    background-color: var(--color-background-tertiary);
    color: var(--secondary);
}

.markdown-output pre {
    background-color: var(--color-background-tertiary) !important;
    border: 1px solid var(--block-border-color);
}

.markdown-output ul,
.markdown-output ol {
    color: var(--text-color);
}

button {
    background: var(--primary) !important;
    color: black !important;
    font-weight: bold !important;
}

button:hover {
    background: var(--primary-300) !important;
}
"""

# Create dark theme
dark_green_theme = gr.themes.Default(
    primary_hue="green",
    secondary_hue="green",
    neutral_hue="green",
).set(
    background_fill_primary="#000000",
    background_fill_secondary="#111111",
    block_background_fill="#111111",
    border_color_accent="#00aa00",
    block_label_text_color="#00ff00",
    body_text_color="#00ff00",
    button_primary_text_color="#000000",
)

# --- GRADIO INTERFACE ---
with gr.Blocks(
    title="DocSum - Document Summarizer",
    theme=dark_green_theme,
    css=custom_css
) as demo:
    gr.Markdown("# 🧾 DocSum")
    gr.Markdown("Document Summarizer Powered by VLM • Developed by [Koshur AI](https://koshurai.com)")

    with gr.Row():
        api_key = gr.Textbox(
            label="🔑 OpenRouter API Key",
            type="password",
            placeholder="Enter your OpenRouter API key"
        )
        user_prompt = gr.Textbox(
            label="📝 Enter Your Prompt",
            value="Extract all content structurally",
            placeholder="What would you like to extract?"
        )

    uploaded_file = gr.File(
        label="Upload Document (PDF/Image)",
        file_types=[".pdf", ".jpg", ".jpeg", ".png"]
    )

    submit_btn = gr.Button("🔍 Analyze Document", variant="primary")

    # Markdown output with custom class
    output = gr.Markdown(
        label="Analysis Results",
        elem_classes=["markdown-output"]
    )

    submit_btn.click(
        fn=analyze_document,
        inputs=[api_key, user_prompt, uploaded_file],
        outputs=output
    )

if __name__ == "__main__":
    demo.launch()
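# Running the app locally (a sketch; package names are inferred from the imports
# above, and "app.py" is just a placeholder filename for this script):
#
#   pip install gradio openai pymupdf pillow
#   python app.py
#
# Gradio prints a local URL; open it, paste an OpenRouter API key, upload a PDF
# or image, and click "🔍 Analyze Document" to see the per-page results and
# summary rendered in the Markdown panel.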