Spaces:
Sleeping
Sleeping
import gradio as gr | |
import fitz # PyMuPDF | |
import requests | |
import os | |
import tempfile | |
import base64 | |
from typing import Optional, Tuple | |
# OCR.space API configuration.
# The endpoint is fixed. The key is read from the OCR_API_KEY environment
# variable and falls back to a placeholder value when the variable is unset.
OCR_API_URL = 'https://api.ocr.space/parse/image'
OCR_API_KEY = os.environ.get('OCR_API_KEY', 'your_ocr_space_api_key_here')
def extract_text_with_ocr(pdf_file_path: str) -> str:
    """Extract text from the first page of a PDF via the OCR.space API.

    The first page is rendered to a PNG through a 2x scaling matrix,
    base64-encoded and posted to ``OCR_API_URL`` together with
    ``OCR_API_KEY``. Only the first page is processed; any further pages
    are ignored.

    Args:
        pdf_file_path: Path of the PDF to run OCR on.

    Returns:
        The ``ParsedText`` of the first parsed result on success. Failures
        are reported in-band instead of being raised, as strings starting
        with ``"OCR Error:"`` (the service flagged a processing error),
        ``"OCR API Error:"`` (non-200 HTTP status) or
        ``"OCR processing error:"`` (any exception raised while rendering,
        sending or decoding). ``"No text extracted from OCR"`` is returned
        when the response carries no parsed results.
    """
    try:
        # Render inside a context manager so the document is closed even
        # when the page lookup or the rendering raises.
        with fitz.open(pdf_file_path) as doc:
            first_page = doc[0]
            zoom = fitz.Matrix(2.0, 2.0)  # Higher resolution for the OCR input
            png_bytes = first_page.get_pixmap(matrix=zoom).tobytes("png")

        # The API receives the image as a base64 data URI.
        img_base64 = base64.b64encode(png_bytes).decode('utf-8')

        payload = {
            'apikey': OCR_API_KEY,
            'language': 'eng',
            'isOverlayRequired': False,
            'base64Image': f'data:image/png;base64,{img_base64}',
            'iscreatesearchablepdf': False,
            'issearchablepdfhidetextlayer': False,
        }

        response = requests.post(OCR_API_URL, data=payload, timeout=60)
        if response.status_code != 200:
            return f"OCR API Error: {response.status_code}"

        result = response.json()
        if result.get('IsErroredOnProcessing', False):
            return f"OCR Error: {result.get('ErrorMessage', 'Unknown error')}"

        parsed_results = result.get('ParsedResults', [])
        if not parsed_results:
            return "No text extracted from OCR"
        return parsed_results[0].get('ParsedText', 'No text found')
    except Exception as e:
        # Boundary handler: callers expect an error string, never an exception.
        return f"OCR processing error: {str(e)}"
def extract_text_from_pdf(pdf_file) -> Tuple[str, str]:
    """Extract text from an uploaded PDF, falling back to OCR when needed.

    The embedded text layer is read with PyMuPDF first. When that yields
    50 characters or fewer (page headers included), the PDF is treated as
    image-based and handed to ``extract_text_with_ocr``, which processes
    the first page only.

    Args:
        pdf_file: The uploaded file, either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``.
            ``None`` means nothing was uploaded.

    Returns:
        A ``(text, status)`` tuple. ``text`` is the extracted text or a
        user-facing explanation of what went wrong; ``status`` is a short
        label describing the outcome. Errors are never raised to the caller.
    """
    if pdf_file is None:
        return "No file uploaded", "β Error"

    status = "β Success"
    try:
        # The upload may arrive as a plain path (gr.File(type="filepath")
        # passes a str) or as a file-like wrapper carrying the path in
        # ``.name``; calling ``.name`` on a str would raise AttributeError.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = pdf_file
        else:
            pdf_path = pdf_file.name

        # Primary method: PyMuPDF text extraction. The context manager
        # closes the document even if a page fails to extract.
        with fitz.open(pdf_path) as doc:
            page_chunks = []
            for page_num, page in enumerate(doc):
                page_text = page.get_text("text")
                if page_text.strip():
                    page_chunks.append(
                        f"\n--- Page {page_num + 1} ---\n{page_text}\n"
                    )
        text = "".join(page_chunks).strip()

        # If we got meaningful text, return it
        if len(text) > 50:  # Arbitrary threshold
            return text, status

        # If no text or very little text, try OCR fallback
        status = "β οΈ Using OCR (Image-based PDF detected)"

        # Without a real key the OCR request cannot succeed; explain setup.
        if OCR_API_KEY == 'your_ocr_space_api_key_here':
            return ("No extractable text found. This appears to be an image-based PDF.\n"
                    "To extract text from image-based PDFs, please:\n"
                    "1. Get a free API key from https://ocr.space/ocrapi\n"
                    "2. Set the OCR_API_KEY environment variable\n"
                    "3. Restart the application"), "β OCR Not Configured"

        ocr_text = extract_text_with_ocr(pdf_path)
        # extract_text_with_ocr reports failures in-band; all three prefixes
        # (including the HTTP-status one) must be treated as errors.
        ocr_failure_prefixes = (
            "OCR Error:",
            "OCR API Error:",
            "OCR processing error:",
        )
        if ocr_text.startswith(ocr_failure_prefixes):
            return f"Primary extraction failed, OCR fallback error:\n{ocr_text}", "β OCR Failed"
        return f"Extracted using OCR:\n\n{ocr_text}", status
    except Exception as e:
        # Boundary handler: turn any failure into a user-facing message.
        detail = str(e)
        error_msg = f"Error processing PDF: {detail}"
        # Map recognisable failure texts to friendlier explanations.
        if "No such file" in detail:
            error_msg = "File not found. Please try uploading the PDF again."
        elif "not a PDF" in detail:
            error_msg = "Invalid file format. Please upload a valid PDF file."
        elif "encrypted" in detail.lower():
            error_msg = "This PDF is password-protected. Please provide an unlocked PDF."
        elif "corrupted" in detail.lower():
            error_msg = "This PDF file appears to be corrupted. Please try a different file."
        return error_msg, "β Error"
def clear_output():
    """Return the values that reset the text and status outputs.

    Returns:
        A pair: an empty string for the extracted-text box and the
        ready label for the status box.
    """
    cleared_text = ""
    ready_status = "π Ready"
    return cleared_text, ready_status
# Build the Gradio UI: a controls column (scale=1) next to a results column (scale=2).
with gr.Blocks(theme=gr.themes.Soft(), title="PDF Text Extraction App") as demo:
    # Heading and feature summary
    gr.Markdown("# π PDF Text Extraction App")
    gr.Markdown(
        "\n"
        "Upload a PDF file to extract its text content.\n"
        "**Features:**\n"
        "- β Direct text extraction from text-based PDFs\n"
        "- π OCR fallback for image-based PDFs (requires OCR.space API key)\n"
        "- π Status indicators for extraction method used\n"
    )

    with gr.Row():
        # Controls: upload widget, action buttons, status line, OCR setup note
        with gr.Column(scale=1):
            pdf_input = gr.File(
                type="filepath",
                file_types=[".pdf"],
                label="π Upload PDF File",
            )

            with gr.Row():
                extract_btn = gr.Button("π Extract Text", size="lg", variant="primary")
                clear_btn = gr.Button("ποΈ Clear", variant="secondary")

            # Read-only, single-line status display
            status_output = gr.Textbox(
                value="π Ready",
                label="Status",
                max_lines=1,
                interactive=False,
            )

            # How to enable the OCR fallback
            gr.Markdown(
                "\n"
                "**OCR Configuration:**\n"
                "Set `OCR_API_KEY` environment variable for image-based PDF support.\n"
                "Get free API key at: https://ocr.space/ocrapi\n"
            )

        # Results: the extracted text
        with gr.Column(scale=2):
            text_output = gr.Textbox(
                placeholder="Extracted text will appear here...",
                label="π Extracted Text",
                show_copy_button=True,
                lines=25,
                max_lines=50,
            )

    # Wiring: the button and a change of the uploaded file both run the
    # extraction; the clear button resets both outputs.
    extract_btn.click(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=[text_output, status_output],
    )
    clear_btn.click(
        fn=clear_output,
        outputs=[text_output, status_output],
    )
    pdf_input.change(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=[text_output, status_output],
    )

    # Footer with usage tips
    gr.Markdown(
        "\n"
        "---\n"
        "**Tips:**\n"
        "- For best results with image-based PDFs, ensure good image quality\n"
        "- Large PDFs may take longer to process\n"
        "- OCR works best with clear, high-contrast text\n"
    )
# Start the server only when this file is run as a script.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    demo.launch(**launch_options)