# Spaces:
# Sleeping
# Sleeping
# File size: 7,051 Bytes
# 625982b c1ce0f7 2d31420 c1ce0f7 2d31420 625982b 2d31420 625982b 2d31420 625982b 2d31420 625982b 2d31420 625982b 2d31420 c1ce0f7 625982b 2d31420 625982b 2d31420 625982b 2d31420 625982b 2d31420 625982b 2d31420 625982b 2d31420 625982b 2d31420 625982b 2d31420 625982b 2d31420 625982b 2d31420 625982b 2d31420 625982b 2d31420 c1ce0f7 625982b c1ce0f7 2d31420 |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
import gradio as gr
import fitz # PyMuPDF
import requests
import os
import tempfile
import base64
from typing import Optional, Tuple
# OCR.space settings: the endpoint that receives the rendered page image, and
# the API key, read from the OCR_API_KEY environment variable with a
# placeholder default used when the variable is unset.
OCR_API_URL = 'https://api.ocr.space/parse/image'
OCR_API_KEY = os.environ.get('OCR_API_KEY', 'your_ocr_space_api_key_here')
def extract_text_with_ocr(pdf_file_path: str) -> str:
    """Extract text from the first page of a PDF via the OCR.space API.

    The first page is rendered to a PNG at 2x zoom, base64-encoded and posted
    to ``OCR_API_URL`` together with ``OCR_API_KEY``.

    Args:
        pdf_file_path: Path of the PDF file to run OCR on.

    Returns:
        The text reported by the API for the first parsed result. Failures
        never raise; they are returned as a string starting with
        ``"OCR Error:"`` (the API flagged a processing error),
        ``"OCR API Error:"`` (non-200 HTTP status) or
        ``"OCR processing error:"`` (any exception raised while rendering,
        sending or decoding). ``"No text extracted from OCR"`` is returned
        when the response carries no parsed results.
    """
    try:
        doc = fitz.open(pdf_file_path)
        try:
            # Only the first page is sent to the OCR service.
            page = doc[0]
            zoom = fitz.Matrix(2.0, 2.0)  # 2x zoom for a higher-resolution render
            img_data = page.get_pixmap(matrix=zoom).tobytes("png")
        finally:
            # Release the document even when rendering the page fails.
            doc.close()

        # The API accepts the image inline as a base64 data URI.
        img_base64 = base64.b64encode(img_data).decode('utf-8')
        payload = {
            'apikey': OCR_API_KEY,
            'language': 'eng',
            'isOverlayRequired': False,
            'base64Image': f'data:image/png;base64,{img_base64}',
            'iscreatesearchablepdf': False,
            'issearchablepdfhidetextlayer': False,
        }

        response = requests.post(OCR_API_URL, data=payload, timeout=60)
        if response.status_code != 200:
            return f"OCR API Error: {response.status_code}"

        result = response.json()
        if result.get('IsErroredOnProcessing', False):
            message = result.get('ErrorMessage', 'Unknown error')
            # The service may report several messages at once; show them as
            # readable text instead of a Python list repr.
            if isinstance(message, (list, tuple)):
                message = '; '.join(str(part) for part in message) or 'Unknown error'
            return f"OCR Error: {message}"

        parsed_results = result.get('ParsedResults', [])
        if not parsed_results:
            return "No text extracted from OCR"
        return parsed_results[0].get('ParsedText', 'No text found')
    except Exception as e:
        # Best-effort fallback: report the problem as text rather than raising.
        return f"OCR processing error: {str(e)}"
def _friendly_pdf_error(exc: Exception) -> str:
    """Map an exception raised while reading a PDF to a user-facing message."""
    detail = str(exc)
    lowered = detail.lower()  # match case-insensitively; library messages vary in case
    if isinstance(exc, FileNotFoundError) or "no such file" in lowered:
        return "File not found. Please try uploading the PDF again."
    if "not a pdf" in lowered:
        return "Invalid file format. Please upload a valid PDF file."
    if "encrypted" in lowered:
        return "This PDF is password-protected. Please provide an unlocked PDF."
    if "corrupted" in lowered:
        return "This PDF file appears to be corrupted. Please try a different file."
    return f"Error processing PDF: {detail}"


def extract_text_from_pdf(pdf_file) -> Tuple[str, str]:
    """Extract text from an uploaded PDF, falling back to OCR when needed.

    Direct extraction with PyMuPDF is tried first. If it yields 50 characters
    or fewer (page headers included), the PDF is treated as image-based and
    ``extract_text_with_ocr`` is used, provided ``OCR_API_KEY`` has been
    changed from its placeholder value.

    Args:
        pdf_file: The upload to read: either a path string or an object
            exposing the path as ``.name``. ``None`` means nothing was
            uploaded.

    Returns:
        A ``(text, status)`` tuple. ``text`` is the extracted text or an
        explanatory message; ``status`` is a short label describing which
        method was used or what failed. Errors are reported through the
        tuple; the function does not raise.
    """
    if pdf_file is None:
        return "No file uploaded", "❌ Error"

    try:
        # The upload widget may hand over a plain path string; objects that
        # carry the path in ``.name`` are still accepted.
        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

        # Primary method: PyMuPDF text extraction, page by page.
        doc = fitz.open(pdf_path)
        try:
            sections = []
            for page_num, page in enumerate(doc):
                page_text = page.get_text("text")
                if page_text.strip():
                    sections.append(f"\n--- Page {page_num + 1} ---\n{page_text}\n")
        finally:
            # Release the document even when a page fails to extract.
            doc.close()
        text = "".join(sections).strip()

        # Arbitrary threshold: anything shorter is assumed to be a scan.
        if len(text) > 50:
            return text, "✅ Success"

        # OCR needs a real key; the placeholder means it was never configured.
        if OCR_API_KEY == 'your_ocr_space_api_key_here':
            return ("No extractable text found. This appears to be an image-based PDF.\n"
                    "To extract text from image-based PDFs, please:\n"
                    "1. Get a free API key from https://ocr.space/ocrapi\n"
                    "2. Set the OCR_API_KEY environment variable\n"
                    "3. Restart the application"), "❌ OCR Not Configured"

        ocr_text = extract_text_with_ocr(pdf_path)
        # Every failure prefix the OCR helper can return, including HTTP errors.
        if ocr_text.startswith(("OCR Error:", "OCR API Error:", "OCR processing error:")):
            return f"Primary extraction failed, OCR fallback error:\n{ocr_text}", "❌ OCR Failed"
        return f"Extracted using OCR:\n\n{ocr_text}", "⚠️ Using OCR (Image-based PDF detected)"
    except Exception as e:
        # UI boundary: turn any failure into a message instead of crashing.
        return _friendly_pdf_error(e), "❌ Error"
def clear_output():
    """Return the values that reset the output text box and the status field."""
    cleared_text = ""
    idle_status = "π Ready"
    return cleared_text, idle_status
# Create the Gradio interface
# NOTE(review): the reviewed text had no indentation; the nesting below is
# reconstructed from the Row/Column structure - confirm against the real layout.
# NOTE(review): labels containing "π" or "β" look like mis-decoded emoji;
# restore the intended characters from the original UTF-8 source.
with gr.Blocks(title="PDF Text Extraction App", theme=gr.themes.Soft()) as demo:
    # Page heading and feature overview.
    gr.Markdown("# π PDF Text Extraction App")
    gr.Markdown("""
Upload a PDF file to extract its text content.
**Features:**
- β
Direct text extraction from text-based PDFs
- π OCR fallback for image-based PDFs (requires OCR.space API key)
- π Status indicators for extraction method used
""")
    with gr.Row():
        # Narrow column: upload control, action buttons, status and OCR notes.
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="π Upload PDF File",
                file_types=[".pdf"],
                type="filepath"
            )
            with gr.Row():
                extract_btn = gr.Button("π Extract Text", variant="primary", size="lg")
                clear_btn = gr.Button("ποΈ Clear", variant="secondary")
            # Status indicator
            status_output = gr.Textbox(
                label="Status",
                value="π Ready",
                interactive=False,
                max_lines=1
            )
            # OCR Configuration info
            gr.Markdown("""
**OCR Configuration:**
Set `OCR_API_KEY` environment variable for image-based PDF support.
Get free API key at: https://ocr.space/ocrapi
""")
        # Wide column: the extracted text.
        with gr.Column(scale=2):
            text_output = gr.Textbox(
                label="π Extracted Text",
                lines=25,
                max_lines=50,
                placeholder="Extracted text will appear here...",
                show_copy_button=True
            )
    # Event handlers
    # The extract button sends the uploaded file to extract_text_from_pdf and
    # writes the returned (text, status) pair to the two text boxes.
    extract_btn.click(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=[text_output, status_output]
    )
    # The clear button resets the two text boxes; the file input is not an
    # output here, so it is left as it is.
    clear_btn.click(
        fn=clear_output,
        outputs=[text_output, status_output]
    )
    # Auto-extract when file is uploaded
    # The same handler is registered on the file input's change event.
    pdf_input.change(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=[text_output, status_output]
    )
    # Footer
    gr.Markdown("""
---
**Tips:**
- For best results with image-based PDFs, ensure good image quality
- Large PDFs may take longer to process
- OCR works best with clear, high-contrast text
""")
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    demo.launch(**launch_options)