Spaces:

adil9858
/

DOCSUM

Sleeping

App Files Files Community

adil9858 committed on May 5

Commit

34b887b

verified ·

1 Parent(s): 4ba60cd

Update app.py

Browse files

Files changed (1) hide show

app.py +360 -196

app.py CHANGED Viewed

@@ -6,48 +6,81 @@ import io
 import fitz  # PyMuPDF
 import tempfile
 import os
-# --- HELPER FUNCTIONS ---
-def convert_pdf_to_images(pdf_file):
-    """Convert PDF to list of PIL Images"""
     images = []
     try:
-        # Save uploaded file to a temporary file
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
-            tmp_file.write(pdf_file)
-            tmp_file_path = tmp_file.name
-        # Open the PDF file
-        pdf_document = fitz.open(tmp_file_path)
-        # Iterate through each page
-        for page_num in range(len(pdf_document)):
             page = pdf_document.load_page(page_num)
-            pix = page.get_pixmap()
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
             images.append(img)
-        # Clean up
         pdf_document.close()
-        os.unlink(tmp_file_path)
     except Exception as e:
-        raise gr.Error(f"Error converting PDF: {e}")
-    return images
 def image_to_base64(image):
     """Convert PIL Image to base64 string"""
     with io.BytesIO() as buffer:
         image.save(buffer, format="PNG")
         return base64.b64encode(buffer.getvalue()).decode("utf-8")
 def generate_summary(extracted_texts, api_key):
     """Generate a comprehensive summary of all extracted texts"""
     try:
-        client = OpenAI(
-            base_url="https://openrouter.ai/api/v1",
-            api_key=api_key
-        )
         summary_prompt = f"""
         You are an expert document analyst. Below are the extracted contents from multiple pages of a document.
@@ -58,226 +91,357 @@ def generate_summary(extracted_texts, api_key):
         4. Presents the information in a clear, structured format
         Extracted contents from pages:
         {extracted_texts}
         Comprehensive Summary:
         """
         response = client.chat.completions.create(
-            model="opengvlab/internvl3-14b:free",
             messages=[
                 {"role": "system", "content": "You are Dalton, an expert in analyzing and summarizing document contents."},
                 {"role": "user", "content": summary_prompt}
             ],
-            max_tokens=2048
         )
         return response.choices[0].message.content
     except Exception as e:
-        raise gr.Error(f"Error generating summary: {e}")
-def analyze_document(api_key, user_prompt, uploaded_file):
-    """Main processing function"""
-    if not api_key:
-        raise gr.Error("Please enter your OpenRouter API key")
-    if uploaded_file is None:
-        raise gr.Error("Please upload a document")
-    images_to_analyze = []
-    file_ext = os.path.splitext(uploaded_file.name)[1].lower()
-    # Handle PDF or image
-    if file_ext == '.pdf':
-        with open(uploaded_file.name, "rb") as f:
-            pdf_data = f.read()
-        pdf_images = convert_pdf_to_images(pdf_data)
-        images_to_analyze = pdf_images  # For simplicity, using all pages
-    else:
-        image = Image.open(uploaded_file.name)
-        images_to_analyze = [image]
-    # Process each image
-    all_results = []
-    extracted_texts = []
-    for idx, image in enumerate(images_to_analyze, 1):
         try:
-            client = OpenAI(
-                base_url="https://openrouter.ai/api/v1",
-                api_key=api_key
-            )
-            image_base64 = image_to_base64(image)
-            response = client.chat.completions.create(
-                model="opengvlab/internvl3-14b:free",
-                messages=[
-                    {"role": "system", "content": "You are Dalton, an expert in understanding images that can analyze images and provide detailed descriptions."},
-                    {"role": "user", "content": [
-                        {"type": "text", "text": user_prompt},
-                        {"type": "image_url", "image_url": {
-                            "url": f"data:image/png;base64,{image_base64}"
-                        }}
-                    ]}
-                ],
-                max_tokens=1024
-            )
-            result = response.choices[0].message.content
-            extracted_texts.append(f"### Page {idx}\n{result}\n")
-            all_results.append(f"## 📄 Page {idx} Results\n{result}\n---\n")
         except Exception as e:
-            raise gr.Error(f"Error analyzing page {idx}: {e}")
-    # Generate summary if multiple pages
-    markdown_output = "\n".join(all_results)
-    if len(extracted_texts) > 1:
-        summary = generate_summary("\n".join(extracted_texts), api_key)
-        markdown_output += f"\n## 📝 Comprehensive Summary\n{summary}\n"
-        # Add structured data section
-    return markdown_output
-# Custom CSS for dark theme with green text
-custom_css = """
-:root {
-    --primary: #00ff00;
-    --primary-50: #00ff0033;
-    --primary-100: #00ff0066;
-    --primary-200: #00ff0099;
-    --primary-300: #00ff00cc;
-    --secondary: #00cc00;
-    --secondary-50: #00cc0033;
-    --secondary-100: #00cc0066;
-    --secondary-200: #00cc0099;
-    --secondary-300: #00cc00cc;
-    --color-background-primary: #000000;
-    --color-background-secondary: #111111;
-    --color-background-tertiary: #222222;
-    --text-color: #00ff00;
-    --block-background-fill: #111111;
-    --block-border-color: #00aa00;
-    --block-label-text-color: #00ff00;
-    --block-title-text-color: #00ff00;
-    --input-background-fill: #111111;
-    --input-border-color: #00aa00;
-    --input-text-color: #00ff00;
-}
 body {
-    background-color: var(--color-background-primary) !important;
-    color: var(--text-color) !important;
 }
-.markdown-output {
     padding: 20px;
-    border-radius: 8px;
-    background: var(--color-background-secondary);
-    border: 1px solid var(--block-border-color);
-    max-height: 600px;
-    overflow-y: auto;
-    color: var(--text-color) !important;
 }
-.markdown-output h1,
-.markdown-output h2,
-.markdown-output h3 {
-    color: var(--primary) !important;
-    border-bottom: 1px solid var(--primary-300);
 }
-.markdown-output a {
-    color: var(--secondary) !important;
 }
-.markdown-output code {
-    background-color: var(--color-background-tertiary);
-    color: var(--secondary);
 }
-.markdown-output pre {
-    background-color: var(--color-background-tertiary) !important;
-    border: 1px solid var(--block-border-color);
 }
-.markdown-output ul,
-.markdown-output ol {
-    color: var(--text-color);
 }
-button {
-    background: var(--primary) !important;
-    color: black !important;
-    font-weight: bold !important;
 }
-button:hover {
-    background: var(--primary-300) !important;
 }
 """
-# Create dark theme
-dark_green_theme = gr.themes.Default(
-    primary_hue="green",
-    secondary_hue="green",
-    neutral_hue="green",
-).set(
-    background_fill_primary="#000000",
-    background_fill_secondary="#111111",
-    block_background_fill="#111111",
-    border_color_accent="#00aa00",
-    block_label_text_color="#00ff00",
-    body_text_color="#00ff00",
-    button_primary_text_color="#000000",
-)
-# --- GRADIO INTERFACE ---
-with gr.Blocks(
-    title="DocSum - Document Summarizer",
-    theme=dark_green_theme,
-    css=custom_css
-) as demo:
-    gr.Markdown("# 🧾 DocSum")
-    gr.Markdown("Document Summarizer Powered by VLM • Developed by [Koshur AI](https://koshurai.com)")
     with gr.Row():
-        api_key = gr.Textbox(
-            label="🔑 OpenRouter API Key",
-            type="password",
-            placeholder="Enter your OpenRouter API key"
-        )
-        user_prompt = gr.Textbox(
             label="📝 Enter Your Prompt",
             value="Extract all content structurally",
-            placeholder="What would you like to extract?"
         )
-    uploaded_file = gr.File(
-        label="Upload Document (PDF/Image)",
-        file_types=[".pdf", ".jpg", ".jpeg", ".png"]
     )
-    submit_btn = gr.Button("🔍 Analyze Document", variant="primary")
-    # Markdown output with custom class
-    output = gr.Markdown(
-        label="Analysis Results",
-        elem_classes=["markdown-output"]
     )
-    submit_btn.click(
         fn=analyze_document,
-        inputs=[api_key, user_prompt, uploaded_file],
-        outputs=output
     )
 if __name__ == "__main__":
-    demo.launch()

 import fitz  # PyMuPDF
 import tempfile
 import os
+import shutil # Added for cleaning up temp dirs
+# --- OPENAI CLIENT SETUP ---
+# Use environment variable or textbox for API key for better security in deployed apps
+# client = OpenAI(
+#     base_url="https://openrouter.ai/api/v1",
+#     api_key=os.getenv("OPENROUTER_API_KEY") # Recommended approach
+# )
+# For this example, we'll get the key from the input field
+def get_openai_client(api_key):
+    """Build an OpenAI-compatible client that targets the OpenRouter endpoint.
+
+    Args:
+        api_key: OpenRouter API key supplied by the caller.
+
+    Returns:
+        An ``OpenAI`` client configured with the OpenRouter base URL.
+
+    Raises:
+        ValueError: If ``api_key`` is empty or ``None``.
+    """
+    if api_key:
+        client_options = {
+            "base_url": "https://openrouter.ai/api/v1",
+            "api_key": api_key,
+        }
+        return OpenAI(**client_options)
+    # Fail fast so a blank key is rejected before any request is attempted.
+    raise ValueError("API key is required.")
+def convert_pdf_to_images(pdf_path):
+    """Render every page of a PDF to a PIL image and to a PNG preview file.
+
+    Args:
+        pdf_path: Filesystem path of the PDF to open.
+
+    Returns:
+        A 4-tuple ``(images, temp_image_paths, num_pages, temp_dir)``:
+        the in-memory PIL images, the paths of the PNG copies written for
+        previewing, the page count, and the temporary directory holding the
+        PNG files (the caller owns its cleanup).
+        On any failure, or for a PDF without pages, nothing is left on disk
+        and ``([], [], 0, None)`` is returned.
+    """
     images = []
+    temp_image_paths = []
+    temp_dir = None
+    pdf_document = None
     try:
+        pdf_document = fitz.open(pdf_path)
+        num_pages = len(pdf_document)
+        if num_pages == 0:
+            # Callers treat an empty image list as failure, so do not create
+            # a directory that nobody would ever remove.
+            return [], [], 0, None
+        # Create a temporary directory for images
+        temp_dir = tempfile.mkdtemp()
+        for page_num in range(num_pages):
             page = pdf_document.load_page(page_num)
+            # Render at a higher DPI for better clarity for VLM
+            pix = page.get_pixmap(dpi=300)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
             images.append(img)
+            # Save image to temp directory for Gradio preview/processing later
+            temp_img_path = os.path.join(temp_dir, f"page_{page_num+1}.png")
+            img.save(temp_img_path, format="PNG")
+            temp_image_paths.append(temp_img_path)
+        return images, temp_image_paths, num_pages, temp_dir
     except Exception as e:
+        print(f"Error converting PDF: {e}")
+        # Clean up temp dir if it was created
+        if temp_dir and os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir, ignore_errors=True)
+        return [], [], 0, None
+    finally:
+        # Close the document on every path, including a failure mid-render.
+        if pdf_document is not None:
+            pdf_document.close()
 def image_to_base64(image):
+    """Encode a PIL image as the base64 text of its PNG serialization.
+
+    Args:
+        image: PIL image; anything that is not already in ``RGB`` mode is
+            converted first (for example RGBA input).
+
+    Returns:
+        The base64-encoded PNG bytes as a UTF-8 string.
+    """
+    rgb_image = image if image.mode == 'RGB' else image.convert('RGB')
+    png_stream = io.BytesIO()
+    try:
+        # PNG is lossless and well-supported
+        rgb_image.save(png_stream, format="PNG")
+        png_bytes = png_stream.getvalue()
+    finally:
+        png_stream.close()
+    return base64.b64encode(png_bytes).decode("utf-8")
 def generate_summary(extracted_texts, api_key):
+    """Summarize the per-page extraction text with one chat-completion request.
+
+    Args:
+        extracted_texts: Text placed in the prompt; the caller in this file
+            passes the per-page results joined into a single string.
+        api_key: Key forwarded to ``get_openai_client``.
+
+    Returns:
+        The content of the first choice of the response. Returns
+        ``"No content extracted to summarize."`` when ``extracted_texts`` is
+        empty, and an ``"Error generating summary: ..."`` string when
+        anything in the request raises (the exception is printed, not
+        re-raised).
+    """
+    if not extracted_texts:
+        return "No content extracted to summarize."
     try:
+        client = get_openai_client(api_key)
         summary_prompt = f"""
         You are an expert document analyst. Below are the extracted contents from multiple pages of a document.
         4. Presents the information in a clear, structured format
         Extracted contents from pages:
+        ---
         {extracted_texts}
+        ---
         Comprehensive Summary:
         """
         response = client.chat.completions.create(
+            model="opengvlab/internvl3-14b:free", # Ensure this model is available via OpenRouter
             messages=[
                 {"role": "system", "content": "You are Dalton, an expert in analyzing and summarizing document contents."},
                 {"role": "user", "content": summary_prompt}
             ],
+            max_tokens=2048 # Adjust as needed
         )
         return response.choices[0].message.content
     except Exception as e:
+        # Errors are reported through the return value so the caller can
+        # display them in place of the summary.
+        print(f"Error generating summary: {e}")
+        return f"Error generating summary: {e}"
+# --- Gradio App Functions ---
+def process_upload(file_obj):
+    """Handle a new upload: load it as images, write previews, build state.
+
+    Args:
+        file_obj: Value delivered by the ``gr.File`` component, or ``None``
+            when the upload was cleared. Depending on the Gradio version
+            this may be a tempfile-like wrapper exposing ``.name`` (and
+            ``.orig_name``) or a plain path string; both are accepted.
+
+    Returns:
+        An 8-tuple ``(images_state, selected_pages, preview_paths,
+        page_options, status, results, summary, temp_dir)``.
+        ``images_state`` is ``(PIL images, preview file paths, temp dir)``
+        on success and ``None`` otherwise; ``results`` and ``summary`` are
+        always ``None`` so that stale output is cleared; ``temp_dir`` is the
+        directory holding the previews (``None`` on failure).
+    """
+    if file_obj is None:
+        # Clear outputs
+        return None, None, [], [], "Please upload a document.", None, None, None
+    file_path = getattr(file_obj, "name", file_obj)
+    # Prefer the original upload name for the extension, falling back to the path.
+    original_name = getattr(file_obj, "orig_name", None) or file_path
+    file_type = os.path.splitext(str(original_name))[1].lstrip(".").lower()
+    if file_type == "pdf":
+        images, temp_paths, num_pages, temp_dir = convert_pdf_to_images(file_path)
+        if not images:
+            return None, None, [], [], "Failed to convert PDF to images.", None, None, None
+        page_options = [f"Page {i}" for i in range(1, num_pages + 1)]
+        # State will hold (list of PIL images, list of temp file paths, temp directory path)
+        images_state = (images, temp_paths, temp_dir)
+        status = f"PDF uploaded. {num_pages} pages detected. Select pages to analyze."
+        # By default select all pages
+        return images_state, list(page_options), temp_paths, page_options, status, None, None, temp_dir
+    if file_type in ("jpg", "jpeg", "png"):
+        # Bound before the try so the except branch can test it even when
+        # opening the image is what failed.
+        temp_dir = None
         try:
+            # convert() returns a fully loaded copy, so the source file can
+            # be closed as soon as the block exits.
+            with Image.open(file_path) as opened:
+                img = opened.convert('RGB')
+            # Save to a temp file for Gradio preview
+            temp_dir = tempfile.mkdtemp()
+            temp_img_path = os.path.join(temp_dir, "uploaded_image.png")
+            img.save(temp_img_path, format="PNG")
+            # State will hold (list of PIL images, list of temp file paths, temp directory path)
+            images_state = ([img], [temp_img_path], temp_dir)
+            status = "Image uploaded."
+            # Return empty selection/options for image, but provide the single image path for preview
+            return images_state, [], [temp_img_path], [], status, None, None, temp_dir
         except Exception as e:
+            print(f"Error loading image: {e}")
+            # Clean up temp dir if created
+            if temp_dir and os.path.exists(temp_dir):
+                shutil.rmtree(temp_dir, ignore_errors=True)
+            return None, None, [], [], f"Failed to load image: {e}", None, None, None
+    return None, None, [], [], "Unsupported file type. Please upload JPG, PNG, or PDF.", None, None, None
+def _pick_pages_to_analyze(all_pil_images, selected_page_names):
+    """Map "Page N" labels to ``(page_number, image)`` pairs, in selection order."""
+    if selected_page_names:
+        picked = []
+        for name in selected_page_names:
+            idx = int(name.split(" ")[1]) - 1
+            # Ignore labels that do not correspond to a loaded page.
+            if 0 <= idx < len(all_pil_images):
+                picked.append((idx + 1, all_pil_images[idx]))
+        return picked
+    if len(all_pil_images) == 1:
+        # A single image upload has no page selector; analyze it directly.
+        return [(1, all_pil_images[0])]
+    # Several pages are loaded but none is ticked: nothing to analyze.
+    return []
+
+
+def analyze_document(api_key, user_prompt, images_state, selected_page_names):
+    """Analyze the selected pages with the VLM, streaming progress to the UI.
+
+    This function is a generator. Every outcome, including validation
+    messages and the final result, is yielded: the value of a ``return``
+    statement inside a generator is not delivered to the code iterating it.
+
+    Args:
+        api_key: OpenRouter API key.
+        user_prompt: Instruction sent alongside each page image.
+        images_state: ``(PIL images, preview file paths, temp dir)`` as
+            produced by ``process_upload``, or ``None``.
+        selected_page_names: Labels of the form ``"Page N"``; empty for a
+            single-image upload.
+
+    Yields:
+        ``(individual_results_markdown, summary_text, status_message)``
+        tuples. Intermediate progress updates carry ``None`` for outputs
+        that are not ready yet.
+    """
+    if not api_key:
+        yield None, None, "Please enter your Open Router API Key."
+        return
+    if not images_state or not images_state[0]: # Check if images_state exists and contains images
+        yield None, None, "No document uploaded or converted."
+        return
+    all_pil_images = images_state[0]
+    temp_dir = images_state[2] # Get the temp directory path
+    images_to_analyze = _pick_pages_to_analyze(all_pil_images, selected_page_names)
+    if not images_to_analyze:
+        # The preview files are kept: the state still references them and
+        # the user is expected to tick pages and try again.
+        yield None, None, "No pages selected for analysis."
+        return
+    extracted_texts = []
+    all_results = []
+    try:
+        client = get_openai_client(api_key)
+        for page_num, image in images_to_analyze:
+            yield None, None, f"Analyzing page {page_num}..." # Update status message during processing
+            try:
+                image_base64_data = image_to_base64(image)
+                response = client.chat.completions.create(
+                    model="opengvlab/internvl3-14b:free", # Ensure this model is available via OpenRouter
+                    messages=[
+                        {"role": "system", "content": "You are Dalton, an expert in understanding images that can analyze images and provide detailed descriptions."},
+                        {"role": "user", "content": [
+                            {"type": "text", "text": user_prompt},
+                            {"type": "image_url", "image_url": {
+                                "url": f"data:image/png;base64,{image_base64_data}"
+                            }}
+                        ]}
+                    ],
+                    max_tokens=1024 # Adjust as needed
+                )
+                result = response.choices[0].message.content
+                extracted_texts.append(f"=== Page {page_num} ===\n{result}\n")
+                if len(images_to_analyze) > 1:
+                    all_results.append(f"### 📄 Page {page_num} Result:")
+                else:
+                    all_results.append("### ✅ Analysis Result:")
+                all_results.append(result)
+                all_results.append("---")
+            except Exception as e:
+                error_msg = f"An error occurred analyzing page {page_num}: {e}"
+                print(error_msg)
+                all_results.append(f"### ❌ Error on Page {page_num}:")
+                all_results.append(error_msg)
+                all_results.append("---")
+                # Don't stop, try other pages
+        # Combine individual results
+        individual_results_markdown = "\n".join(all_results) if all_results else "No results generated."
+        # Generate and display comprehensive summary if multiple pages were processed
+        if len(images_to_analyze) > 1 and extracted_texts:
+            yield individual_results_markdown, None, "Generating comprehensive summary..."
+            summary_text = generate_summary("\n".join(extracted_texts), api_key)
+            status_message = "Analysis complete. Summary generated."
+        elif extracted_texts: # Single page case
+            summary_text = "Summary not generated for single page analysis. See analysis result above."
+            status_message = "Analysis complete."
+        else:
+            summary_text = "No content extracted for summary."
+            status_message = "Analysis complete, but no text extracted."
+        yield individual_results_markdown, summary_text, status_message
+    except Exception as e:
+        error_msg = f"An unhandled error occurred during analysis: {e}"
+        print(error_msg)
+        yield None, None, error_msg
+    finally:
+        # Remove the preview directory once an analysis run has finished,
+        # whether it succeeded or failed.
+        # NOTE(review): preview paths held in images_state point at deleted
+        # files after this; confirm that re-selecting pages afterwards is
+        # not expected to refresh the gallery.
+        if temp_dir and os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir, ignore_errors=True)
+# Function to clean up temp dir when session ends or is closed
+def clean_temp_dir(temp_dir):
+    """Delete ``temp_dir`` and everything in it; do nothing if it is unset or missing."""
+    if not temp_dir:
+        return
+    if not os.path.exists(temp_dir):
+        return
+    print(f"Cleaning up temporary directory: {temp_dir}")
+    shutil.rmtree(temp_dir)
+# --- Gradio Interface Layout ---
+# Stylesheet passed to gr.Blocks(css=...). Selectors referenced elsewhere in
+# this file: `.subtitle` (header HTML), `.summary-box` (elem_classes of the
+# summary output) and `#status_message_id` (elem_id of the status Markdown).
+css = """
 body {
+    font-family: 'Inter', sans-serif;
 }
+.gradio-container {
+    max-width: 800px !important;
+    margin: auto;
     padding: 20px;
+    background-color: #f9fafb; /* Light gray background */
 }
+h1, h2, h3, h4 {
+    color: #111827; /* Darker text for headers */
 }
+.subtitle {
+    font-size: 1rem;
+    color: #6b7280; /* Gray text for subtitle */
+    margin-bottom: 2rem;
 }
+.summary-box {
+    background-color: #e0f2fe; /* Light blue background */
+    padding: 1.5rem;
+    border-radius: 8px;
+    margin-top: 1rem; /* Reduced margin-top */
+    border: 1px solid #bfdbfe; /* Light blue border */
 }
+.summary-box p {
+    margin: 0; /* Remove paragraph margin */
 }
+.file-upload-label .wrap {
+    text-align: center !important;
 }
+.gr-button {
+    margin-top: 1rem !important;
 }
+/* Style for the status message */
+#status_message_id {
+    margin-top: 1rem;
+    font-weight: bold;
+    color: #1f2937;
 }
 """
+with gr.Blocks(css=css, title="DocSum - Document Summarizer", theme=gr.themes.Soft()) as demo:
+    # Per-session state written by the upload handler.
+    # Structure: (list of PIL images, list of temp file paths for preview/analysis, temp directory path)
+    images_state = gr.State(None)
+    # Per-session path of the temp directory holding the current previews, kept for cleanup
+    current_temp_dir = gr.State(None)
+    gr.HTML("""
+        <div style="text-align: center;">
+            <img src='https://raw.githubusercontent.com/KoshurAI/DocSum/main/blob.png' width='100'>
+            <h1>DocSum</h1>
+            <p class="subtitle">Document Summarizer Powered by VLM • Developed by <a href="https://koshurai.com" target="_blank">Koshur AI</a></p>
+        </div>
+    """)
     with gr.Row():
+        user_prompt_input = gr.Textbox(
             label="📝 Enter Your Prompt",
             value="Extract all content structurally",
+            lines=2,
+            interactive=True,
+            container=True,
+            scale=2
         )
+        api_key_input = gr.Textbox(
+            label="🔒 OpenRouter API Key",
+            type="password",
+            interactive=True,
+            container=True,
+            scale=1,
+            info="Your key is not stored."
+            # Consider adding value=os.getenv("OPENROUTER_API_KEY", "") for easier local testing
+        )
+    file_upload = gr.File(
+        label="Upload a document (JPG/PNG/PDF)",
+        file_types=[".jpg", ".jpeg", ".png", ".pdf"],
+        interactive=True
+    )
+    # Components for PDF page selection and preview (hidden until an upload provides content)
+    page_selector = gr.CheckboxGroup(
+        label="Select PDF Pages to Analyze",
+        choices=[],
+        value=[],
+        visible=False,
+        interactive=True
+    )
+    preview_gallery = gr.Gallery(
+        label="Selected Page Previews",
+        visible=False,
+        container=True,
+        preview=True, # Show previews
+        columns=3,
+        rows=1,
+        object_fit="contain",
+        height="auto"
     )
+    status_message = gr.Markdown(elem_id="status_message_id") # Use a Markdown element for status updates
+    analyze_button = gr.Button("🔍 Analyze Document")
+    # Outputs
+    individual_results_output = gr.Markdown(label="Page-by-Page Analysis Results")
+    summary_output = gr.Markdown(label="Comprehensive Document Summary", elem_classes="summary-box") # Apply CSS class
+
+    # --- Event Handling ---
+    def _handle_upload(file_obj, previous_temp_dir):
+        """Adapt the 8-tuple from process_upload to the components wired below."""
+        # Previews of the previous upload are no longer referenced once a
+        # new file arrives (or the upload is cleared).
+        clean_temp_dir(previous_temp_dir)
+        (new_state, selected_pages, preview_paths, page_options,
+         status, results, summary, temp_dir) = process_upload(file_obj)
+        if temp_dir is None and new_state:
+            temp_dir = new_state[2]
+        return (
+            new_state,
+            # Choices, selection and visibility of the selector have to be
+            # sent as one update to the component itself.
+            gr.update(choices=page_options, value=selected_pages or [], visible=bool(page_options)),
+            gr.update(value=preview_paths, visible=bool(preview_paths)),
+            status,
+            results,
+            summary,
+            temp_dir,
+        )
+
+    def _preview_paths(selected_pages, state):
+        """Return the preview file paths for the ticked "Page N" labels."""
+        if not state or not state[1]:
+            return []
+        return [state[1][int(name.split(" ")[1]) - 1] for name in selected_pages]
+
+    def _selection_status(selected_pages):
+        """Describe how many pages are currently ticked."""
+        num_selected = len(selected_pages or [])
+        return f"{num_selected} pages selected." if num_selected > 0 else "No pages selected."
+
+    # When a file is uploaded, process it (convert PDF, show previews, update state)
+    file_upload.change(
+        fn=_handle_upload,
+        inputs=[file_upload, current_temp_dir],
+        outputs=[images_state, page_selector, preview_gallery, status_message, individual_results_output, summary_output, current_temp_dir],
+        show_progress=True # Show Gradio's built-in progress indicator
     )
+    # When page selection changes (for PDF), update the preview gallery
+    page_selector.change(
+        fn=_preview_paths,
+        inputs=[page_selector, images_state],
+        outputs=[preview_gallery],
+        show_progress=False # No need for progress bar here
+    ).then( # Chain another event to update status message
+        fn=_selection_status,
+        inputs=[page_selector],
+        outputs=[status_message],
+        show_progress=False
+    )
+    # When the Analyze button is clicked, run the analysis function
+    analyze_button.click(
         fn=analyze_document,
+        inputs=[api_key_input, user_prompt_input, images_state, page_selector],
+        outputs=[individual_results_output, summary_output, status_message],
+        show_progress=False # We handle progress manually with status_message yield
     )
+    # --- Footer ---
+    gr.HTML("<footer style='text-align: center; margin-top: 3rem; color: #9ca3af; font-size: 0.875rem;'>© 2025 Koshur AI. All rights reserved.</footer>")
+# analyze_document streams progress by yielding, which relies on the request
+# queue; enabling it explicitly is harmless where it is already the default.
+demo.queue()
+# Temp-dir cleanup is per session: the upload handler removes the previous
+# upload's preview directory and analyze_document removes the directory it
+# worked on. The earlier module-level `demo.load(...)` cleanup was dropped:
+# listeners have to be registered inside the `with gr.Blocks()` context, and
+# it read `current_temp_dir.value`, which is the State's initial value rather
+# than the value of any running session.
+# --- Launch App ---
 if __name__ == "__main__":
+    # share=True would create a public URL (useful for testing); it is left off here.
+    # debug=True provides more detailed error messages.
+    demo.launch(share=False, debug=True)
+    # Code placed after launch() is not a reliable cleanup hook: it does not
+    # run if the app is killed externally.