# Spaces: okewunmi/pdf-text-extraction (Sleeping) - File size: 7,051 Bytes

import gradio as gr
import fitz  # PyMuPDF
import requests
import os
import tempfile
import base64
from typing import Optional, Tuple

# OCR.space settings. The key comes from the OCR_API_KEY environment variable;
# when it is unset, a placeholder value is used instead.
OCR_API_URL = 'https://api.ocr.space/parse/image'
OCR_API_KEY = os.environ.get('OCR_API_KEY', 'your_ocr_space_api_key_here')

def extract_text_with_ocr(pdf_file_path: str) -> str:
    """Run OCR on the first page of a PDF via the OCR.space API.

    The first page is rendered to a PNG at 2x zoom, base64-encoded and posted
    to ``OCR_API_URL``. Only that page is processed; later pages are ignored.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        The recognised text on success. Failures are reported as a message
        string instead of being raised; the message starts with
        ``"OCR Error:"`` (the API flagged a processing error),
        ``"OCR API Error:"`` (non-200 HTTP status) or
        ``"OCR processing error:"`` (any exception raised while rendering,
        sending or decoding). ``"No text extracted from OCR"`` is returned
        when the response contains no parsed results.
    """
    try:
        # The context manager closes the document even when rendering raises.
        with fitz.open(pdf_file_path) as doc:
            first_page = doc[0]
            zoom = fitz.Matrix(2.0, 2.0)  # 2x scale gives the OCR more pixels
            img_data = first_page.get_pixmap(matrix=zoom).tobytes("png")

        # The API accepts the image inline as a base64 data URI.
        img_base64 = base64.b64encode(img_data).decode('utf-8')

        payload = {
            'apikey': OCR_API_KEY,
            'language': 'eng',
            'isOverlayRequired': False,
            'base64Image': f'data:image/png;base64,{img_base64}',
            'iscreatesearchablepdf': False,
            'issearchablepdfhidetextlayer': False,
        }

        response = requests.post(OCR_API_URL, data=payload, timeout=60)
        if response.status_code != 200:
            return f"OCR API Error: {response.status_code}"

        result = response.json()
        if result.get('IsErroredOnProcessing', False):
            error_message = result.get('ErrorMessage') or 'Unknown error'
            # NOTE(review): the API appears to send ErrorMessage as a list of
            # strings in some responses; flatten it so the user sees plain
            # text rather than a Python list repr.
            if isinstance(error_message, (list, tuple)):
                error_message = "; ".join(str(part) for part in error_message)
            return f"OCR Error: {error_message}"

        parsed_results = result.get('ParsedResults', [])
        if not parsed_results:
            return "No text extracted from OCR"

        # A single image was sent, so only the first result is relevant.
        return parsed_results[0].get('ParsedText', 'No text found')

    except Exception as e:
        # Deliberate catch-all: the caller expects a message, not an exception.
        return f"OCR processing error: {str(e)}"

def extract_text_from_pdf(pdf_file) -> Tuple[str, str]:
    """Extract text from an uploaded PDF, falling back to OCR when needed.

    Direct text extraction is tried first. If it yields 50 characters or
    fewer (page headers included), the PDF is treated as image-based and the
    first page is sent through ``extract_text_with_ocr``.

    Args:
        pdf_file: The uploaded file, given either as a filesystem path string
            or as an object whose ``name`` attribute holds the path. ``None``
            means nothing was uploaded.

    Returns:
        A ``(text, status)`` pair. ``text`` is the extracted text or a
        user-facing message; ``status`` is a short status label. Failures are
        reported through the pair rather than raised.
    """
    if pdf_file is None:
        return "No file uploaded", "❌ Error"

    # Accept both a plain path string and a wrapper object exposing `.name`.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    try:
        # Primary method: PyMuPDF text extraction, one section per page that
        # actually contains text. The context manager closes the document
        # even if a page fails to extract.
        sections = []
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                page_text = page.get_text("text")
                if page_text.strip():
                    sections.append(f"\n--- Page {page_num} ---\n{page_text}\n")
        text = "".join(sections).strip()

        # Arbitrary threshold separating real text from stray characters.
        if len(text) > 50:
            return text, "✅ Success"

        # Little or no text: fall back to OCR, which needs a configured key.
        if not OCR_API_KEY or OCR_API_KEY == 'your_ocr_space_api_key_here':
            return ("No extractable text found. This appears to be an image-based PDF.\n"
                    "To extract text from image-based PDFs, please:\n"
                    "1. Get a free API key from https://ocr.space/ocrapi\n"
                    "2. Set the OCR_API_KEY environment variable\n"
                    "3. Restart the application"), "❌ OCR Not Configured"

        ocr_text = extract_text_with_ocr(pdf_path)

        # Every failure prefix the OCR helper can produce, including HTTP
        # errors, must be reported as a failure and not as extracted text.
        ocr_failure_prefixes = ("OCR Error:", "OCR API Error:", "OCR processing error:")
        if ocr_text.startswith(ocr_failure_prefixes):
            return f"Primary extraction failed, OCR fallback error:\n{ocr_text}", "❌ OCR Failed"

        return f"Extracted using OCR:\n\n{ocr_text}", "⚠️ Using OCR (Image-based PDF detected)"

    except Exception as e:
        # Top-level boundary for the UI: turn any failure into a message.
        details = str(e)
        lowered = details.lower()

        # Match case-insensitively so differently capitalised library
        # messages still map to the friendly text.
        if "no such file" in lowered:
            error_msg = "File not found. Please try uploading the PDF again."
        elif "not a pdf" in lowered:
            error_msg = "Invalid file format. Please upload a valid PDF file."
        elif "encrypted" in lowered:
            error_msg = "This PDF is password-protected. Please provide an unlocked PDF."
        elif "corrupted" in lowered:
            error_msg = "This PDF file appears to be corrupted. Please try a different file."
        else:
            error_msg = f"Error processing PDF: {details}"

        return error_msg, "❌ Error"

def clear_output():
    """Return an empty text value and the idle status label."""
    cleared_text, idle_status = "", "🔄 Ready"
    return cleared_text, idle_status

# Create the Gradio interface.
# Layout: a narrow left column (upload, buttons, status, OCR hint) and a wider
# right column holding the extracted text. Components appear in the order they
# are created, so statement order here is significant.
with gr.Blocks(title="PDF Text Extraction App", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 PDF Text Extraction App")
    gr.Markdown("""
    Upload a PDF file to extract its text content. 
    
    **Features:**
    - ✅ Direct text extraction from text-based PDFs
    - 🔍 OCR fallback for image-based PDFs (requires OCR.space API key)
    - 📊 Status indicators for extraction method used
    """)
    
    with gr.Row():
        # Left column: inputs and status (scale=1, i.e. half the weight of the
        # output column below).
        with gr.Column(scale=1):
            # Upload widget limited to .pdf files.
            # NOTE(review): extract_text_from_pdf reads `pdf_file.name`;
            # confirm that the installed Gradio version passes a value
            # exposing `.name` when type="filepath".
            pdf_input = gr.File(
                label="📎 Upload PDF File",
                file_types=[".pdf"],
                type="filepath"
            )
            
            with gr.Row():
                extract_btn = gr.Button("🔍 Extract Text", variant="primary", size="lg")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary")
            
            # Status indicator: read-only, single line, written by the handlers
            status_output = gr.Textbox(
                label="Status",
                value="🔄 Ready",
                interactive=False,
                max_lines=1
            )
            
            # OCR Configuration info
            gr.Markdown("""
            **OCR Configuration:**
            Set `OCR_API_KEY` environment variable for image-based PDF support.
            Get free API key at: https://ocr.space/ocrapi
            """)
        
        # Right column: extraction result (scale=2)
        with gr.Column(scale=2):
            text_output = gr.Textbox(
                label="📝 Extracted Text",
                lines=25,
                max_lines=50,
                placeholder="Extracted text will appear here...",
                show_copy_button=True
            )
    
    # Event handlers
    # The extract button runs the extraction on the current upload and writes
    # the (text, status) pair to the two output boxes.
    extract_btn.click(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=[text_output, status_output]
    )
    
    # The clear button resets the text and status boxes only; the uploaded
    # file is not among its outputs and is left untouched.
    clear_btn.click(
        fn=clear_output,
        outputs=[text_output, status_output]
    )
    
    # Auto-extract when file is uploaded
    # NOTE(review): presumably `change` also fires when the file is removed,
    # in which case the handler would receive None and report
    # "No file uploaded" -- confirm against the Gradio version in use.
    pdf_input.change(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=[text_output, status_output]
    )
    
    # Footer
    gr.Markdown("""
    ---
    **Tips:**
    - For best results with image-based PDFs, ensure good image quality
    - Large PDFs may take longer to process
    - OCR works best with clear, high-contrast text
    """)

# Script entry point: start the Gradio server on all interfaces, port 7860,
# without a public share link and with debug mode enabled.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    demo.launch(**launch_options)