# Page header residue: okewunmi - "Update app.py" - commit 2d31420 (verified)
import gradio as gr
import fitz # PyMuPDF
import requests
import os
import tempfile
import base64
from typing import Optional, Tuple
# OCR.space API configuration.
# The key is read from the OCR_API_KEY environment variable; when it is unset
# the placeholder below is used, which extract_text_from_pdf treats as
# "OCR not configured".
OCR_API_URL = 'https://api.ocr.space/parse/image'
OCR_API_KEY = os.environ.get('OCR_API_KEY', 'your_ocr_space_api_key_here')
def extract_text_with_ocr(pdf_file_path: str) -> str:
    """Extract text from the first page of a PDF using the OCR.space API.

    The first page is rendered to a PNG at 2x zoom, base64-encoded and posted
    to ``OCR_API_URL`` together with ``OCR_API_KEY``. Only the first page is
    processed.

    Args:
        pdf_file_path: Path of the PDF file on disk.

    Returns:
        The recognised text on success. Failures are never raised; they are
        reported as a string starting with ``"OCR Error:"`` (the API flagged
        a processing error), ``"OCR API Error:"`` (non-200 HTTP status) or
        ``"OCR processing error:"`` (any exception), or as the literal
        ``"No text extracted from OCR"`` when the response has no results.
    """
    try:
        doc = fitz.open(pdf_file_path)
        try:
            # Render the first page at 2x zoom for a higher-resolution image.
            pix = doc[0].get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
            img_data = pix.tobytes("png")
        finally:
            # Release the document even when rendering fails
            # (for example, indexing page 0 of an empty document).
            doc.close()

        # The API accepts the image as a base64 data URI.
        img_base64 = base64.b64encode(img_data).decode('utf-8')
        payload = {
            'apikey': OCR_API_KEY,
            'language': 'eng',
            'isOverlayRequired': False,
            'base64Image': f'data:image/png;base64,{img_base64}',
            'iscreatesearchablepdf': False,
            'issearchablepdfhidetextlayer': False,
        }

        response = requests.post(OCR_API_URL, data=payload, timeout=60)
        if response.status_code != 200:
            return f"OCR API Error: {response.status_code}"

        result = response.json()
        if result.get('IsErroredOnProcessing', False):
            message = result.get('ErrorMessage', 'Unknown error')
            # NOTE(review): the API appears to return ErrorMessage as a list
            # of strings in some cases; flatten it so the user does not see a
            # Python list repr. Confirm against the OCR.space response schema.
            if isinstance(message, (list, tuple)):
                message = "; ".join(str(part) for part in message)
            return f"OCR Error: {message or 'Unknown error'}"

        parsed_results = result.get('ParsedResults', [])
        if not parsed_results:
            return "No text extracted from OCR"
        return parsed_results[0].get('ParsedText', 'No text found')
    except Exception as e:
        # Best-effort fallback: report every failure as text, never raise.
        return f"OCR processing error: {str(e)}"
def extract_text_from_pdf(pdf_file) -> Tuple[str, str]:
    """Extract text from an uploaded PDF, falling back to OCR when needed.

    Text is first read directly with PyMuPDF. If that yields 50 characters or
    fewer (page headers included), the PDF is assumed to be image-based and
    ``extract_text_with_ocr`` is tried, provided ``OCR_API_KEY`` is not the
    placeholder value.

    Args:
        pdf_file: The uploaded file, either a path string (what
            ``gr.File(type="filepath")`` delivers) or an object whose
            ``.name`` attribute is the path. ``None`` means nothing was
            uploaded.

    Returns:
        A ``(text, status)`` tuple for the two output textboxes. Errors are
        never raised; they are reported in ``text`` with an error status.
    """
    if pdf_file is None:
        return "No file uploaded", "❌ Error"

    status = "✅ Success"
    try:
        # Accept both a plain path string and a file-like object with .name;
        # calling .name on a str would raise AttributeError.
        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

        # Primary method: PyMuPDF text extraction, one labelled section per
        # page that has any non-whitespace text.
        doc = fitz.open(pdf_path)
        try:
            sections = []
            for page_num, page in enumerate(doc):
                page_text = page.get_text("text")
                if page_text.strip():
                    sections.append(f"\n--- Page {page_num + 1} ---\n{page_text}\n")
        finally:
            # Close the document even if a page fails to extract.
            doc.close()
        text = "".join(sections).strip()

        # If we got meaningful text, return it (50 is an arbitrary threshold).
        if len(text) > 50:
            return text, status

        # Little or no text: treat the PDF as image-based and try OCR.
        status = "⚠️ Using OCR (Image-based PDF detected)"

        if OCR_API_KEY == 'your_ocr_space_api_key_here':
            return ("No extractable text found. This appears to be an image-based PDF.\n"
                    "To extract text from image-based PDFs, please:\n"
                    "1. Get a free API key from https://ocr.space/ocrapi\n"
                    "2. Set the OCR_API_KEY environment variable\n"
                    "3. Restart the application"), "❌ OCR Not Configured"

        ocr_text = extract_text_with_ocr(pdf_path)
        # Every failure prefix extract_text_with_ocr can return, including
        # the HTTP-status one, must be reported as a failure.
        failure_prefixes = ("OCR Error:", "OCR API Error:", "OCR processing error:")
        if ocr_text.startswith(failure_prefixes):
            return f"Primary extraction failed, OCR fallback error:\n{ocr_text}", "❌ OCR Failed"
        return f"Extracted using OCR:\n\n{ocr_text}", status
    except Exception as e:
        # Top-level boundary: turn any failure into a user-facing message.
        error_msg = f"Error processing PDF: {str(e)}"
        # Match case-insensitively so messages are recognised regardless of
        # how the library capitalises them.
        detail = str(e).lower()
        if "no such file" in detail:
            error_msg = "File not found. Please try uploading the PDF again."
        elif "not a pdf" in detail:
            error_msg = "Invalid file format. Please upload a valid PDF file."
        elif "encrypted" in detail:
            error_msg = "This PDF is password-protected. Please provide an unlocked PDF."
        elif "corrupted" in detail:
            error_msg = "This PDF file appears to be corrupted. Please try a different file."
        return error_msg, "❌ Error"
def clear_output():
    """Reset the UI outputs.

    Returns:
        A ``(text, status)`` tuple: an empty string for the extracted-text
        box and the idle status label for the status box.
    """
    return "", "🔄 Ready"
# Create the Gradio interface: a two-column layout with the upload controls
# and status on the left and the extracted text on the right.
with gr.Blocks(title="PDF Text Extraction App", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 PDF Text Extraction App")
    gr.Markdown("""
    Upload a PDF file to extract its text content.
    **Features:**
    - ✅ Direct text extraction from text-based PDFs
    - 🔍 OCR fallback for image-based PDFs (requires OCR.space API key)
    - 📊 Status indicators for extraction method used
    """)
    with gr.Row():
        with gr.Column(scale=1):
            # type="filepath" hands the callback the uploaded file's path.
            pdf_input = gr.File(
                label="📎 Upload PDF File",
                file_types=[".pdf"],
                type="filepath"
            )
            with gr.Row():
                extract_btn = gr.Button("🔍 Extract Text", variant="primary", size="lg")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary")
            # Status indicator
            status_output = gr.Textbox(
                label="Status",
                value="🔄 Ready",
                interactive=False,
                max_lines=1
            )
            # OCR Configuration info
            gr.Markdown("""
            **OCR Configuration:**
            Set `OCR_API_KEY` environment variable for image-based PDF support.
            Get free API key at: https://ocr.space/ocrapi
            """)
        with gr.Column(scale=2):
            text_output = gr.Textbox(
                label="📝 Extracted Text",
                lines=25,
                max_lines=50,
                placeholder="Extracted text will appear here...",
                show_copy_button=True
            )
    # Event handlers: the button and the upload itself both run extraction.
    extract_btn.click(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=[text_output, status_output]
    )
    clear_btn.click(
        fn=clear_output,
        outputs=[text_output, status_output]
    )
    # Auto-extract when file is uploaded
    pdf_input.change(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=[text_output, status_output]
    )
    # Footer
    gr.Markdown("""
    ---
    **Tips:**
    - For best results with image-based PDFs, ensure good image quality
    - Large PDFs may take longer to process
    - OCR works best with clear, high-contrast text
    """)
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    # Listen on all interfaces, port 7860, with no public share link and
    # with debug mode enabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    demo.launch(**launch_options)