"""Gradio app that extracts text from PDFs, with an OCR.space fallback."""

import base64
import os
import tempfile
from typing import Optional, Tuple

import fitz  # PyMuPDF
import gradio as gr
import requests

# OCR.space API configuration
_OCR_KEY_PLACEHOLDER = 'your_ocr_space_api_key_here'
OCR_API_KEY = os.getenv('OCR_API_KEY', _OCR_KEY_PLACEHOLDER)
OCR_API_URL = 'https://api.ocr.space/parse/image'

# Every failure string produced by extract_text_with_ocr() starts with one of
# these prefixes; extract_text_from_pdf() uses them to tell errors from text.
_OCR_FAILURE_PREFIXES = (
    "OCR Error:",
    "OCR API Error:",
    "OCR processing error:",
)

# Direct extraction shorter than this (after stripping) triggers the OCR path.
_MIN_TEXT_LENGTH = 50  # Arbitrary threshold


def _format_ocr_error(message) -> str:
    """Turn the API's ``ErrorMessage`` value into a single readable string."""
    if not message:
        return 'Unknown error'
    # NOTE(review): the API appears to return a list of messages here;
    # confirm against the OCR.space documentation.
    if isinstance(message, (list, tuple)):
        return "; ".join(str(part) for part in message)
    return str(message)


def _resolve_pdf_path(pdf_file) -> str:
    """Return a filesystem path for whatever the upload component passed in.

    Accepts a plain path (``str`` / ``os.PathLike``) or any object exposing
    the path as ``.name``.
    """
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)
    return pdf_file.name


def extract_text_with_ocr(pdf_file_path: str) -> str:
    """Extract text from the first page of a PDF via the OCR.space API.

    The first page is rendered to a PNG at 2x zoom, base64-encoded and posted
    to ``OCR_API_URL``.

    Args:
        pdf_file_path: Path of the PDF to process.

    Returns:
        The parsed text of the first OCR result on success. On failure a
        message starting with one of ``_OCR_FAILURE_PREFIXES`` is returned
        instead of raising.
    """
    try:
        # Render only the first page; the context manager closes the
        # document even if rendering raises.
        with fitz.open(pdf_file_path) as doc:
            page = doc[0]
            zoom = fitz.Matrix(2.0, 2.0)  # Higher resolution
            pix = page.get_pixmap(matrix=zoom)
            img_data = pix.tobytes("png")

        img_base64 = base64.b64encode(img_data).decode('utf-8')

        payload = {
            'apikey': OCR_API_KEY,
            'language': 'eng',
            'isOverlayRequired': False,
            'base64Image': f'data:image/png;base64,{img_base64}',
            'iscreatesearchablepdf': False,
            'issearchablepdfhidetextlayer': False,
        }

        response = requests.post(OCR_API_URL, data=payload, timeout=60)
        if response.status_code != 200:
            return f"OCR API Error: {response.status_code}"

        result = response.json()
        if result.get('IsErroredOnProcessing', False):
            return f"OCR Error: {_format_ocr_error(result.get('ErrorMessage'))}"

        parsed_results = result.get('ParsedResults', [])
        if parsed_results:
            return parsed_results[0].get('ParsedText', 'No text found')
        return "No text extracted from OCR"

    except Exception as e:
        # Boundary handler: the caller expects a string, never an exception.
        return f"OCR processing error: {str(e)}"


def extract_text_from_pdf(pdf_file) -> Tuple[str, str]:
    """Extract text from an uploaded PDF, falling back to OCR when needed.

    Args:
        pdf_file: The uploaded file as delivered by the UI: ``None`` when
            nothing is uploaded, otherwise a path or an object with ``.name``.

    Returns:
        A ``(text, status)`` pair for the output and status textboxes. Errors
        are reported through the pair rather than raised.
    """
    if pdf_file is None:
        return "No file uploaded", "❌ Error"

    status = "✅ Success"

    try:
        pdf_path = _resolve_pdf_path(pdf_file)

        # Primary method: PyMuPDF text extraction, page by page.
        sections = []
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc):
                page_text = page.get_text("text")
                if page_text.strip():
                    sections.append(
                        f"\n--- Page {page_num + 1} ---\n{page_text}\n"
                    )
        text = "".join(sections).strip()

        # If we got meaningful text, return it
        if len(text) > _MIN_TEXT_LENGTH:
            return text, status

        # If no text or very little text, try OCR fallback
        status = "⚠️ Using OCR (Image-based PDF detected)"

        # An unset, empty or placeholder key means OCR cannot be attempted.
        if not OCR_API_KEY or OCR_API_KEY == _OCR_KEY_PLACEHOLDER:
            return ("No extractable text found. This appears to be an image-based PDF.\n"
                    "To extract text from image-based PDFs, please:\n"
                    "1. Get a free API key from https://ocr.space/ocrapi\n"
                    "2. Set the OCR_API_KEY environment variable\n"
                    "3. Restart the application"), "❌ OCR Not Configured"

        ocr_text = extract_text_with_ocr(pdf_path)

        # Covers every failure message the OCR helper can return, including
        # non-200 HTTP responses.
        if ocr_text.startswith(_OCR_FAILURE_PREFIXES):
            return (f"Primary extraction failed, OCR fallback error:\n{ocr_text}",
                    "❌ OCR Failed")

        return f"Extracted using OCR:\n\n{ocr_text}", status

    except Exception as e:
        # Complete fallback error handling; match hints case-insensitively
        # because library messages differ in capitalisation.
        detail = str(e)
        lowered = detail.lower()
        error_msg = f"Error processing PDF: {detail}"

        if "no such file" in lowered:
            error_msg = "File not found. Please try uploading the PDF again."
        elif "not a pdf" in lowered:
            error_msg = "Invalid file format. Please upload a valid PDF file."
        elif "encrypted" in lowered:
            error_msg = "This PDF is password-protected. Please provide an unlocked PDF."
        elif "corrupted" in lowered:
            error_msg = "This PDF file appears to be corrupted. Please try a different file."

        return error_msg, "❌ Error"


def clear_output():
    """Return the values that reset the text and status textboxes."""
    return "", "🔄 Ready"


# Create the Gradio interface
with gr.Blocks(title="PDF Text Extraction App", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 PDF Text Extraction App")
    gr.Markdown("""
    Upload a PDF file to extract its text content.

    **Features:**
    - ✅ Direct text extraction from text-based PDFs
    - 🔍 OCR fallback for image-based PDFs (requires OCR.space API key)
    - 📊 Status indicators for extraction method used
    """)

    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="📎 Upload PDF File",
                file_types=[".pdf"],
                type="filepath"
            )

            with gr.Row():
                extract_btn = gr.Button("🔍 Extract Text", variant="primary", size="lg")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary")

            # Status indicator
            status_output = gr.Textbox(
                label="Status",
                value="🔄 Ready",
                interactive=False,
                max_lines=1
            )

            # OCR Configuration info
            gr.Markdown("""
            **OCR Configuration:**
            Set `OCR_API_KEY` environment variable for image-based PDF support.
            Get free API key at: https://ocr.space/ocrapi
            """)

        with gr.Column(scale=2):
            text_output = gr.Textbox(
                label="📝 Extracted Text",
                lines=25,
                max_lines=50,
                placeholder="Extracted text will appear here...",
                show_copy_button=True
            )

    # Event handlers
    extract_btn.click(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=[text_output, status_output]
    )

    clear_btn.click(
        fn=clear_output,
        outputs=[text_output, status_output]
    )

    # Auto-extract when file is uploaded
    pdf_input.change(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=[text_output, status_output]
    )

    # Footer
    gr.Markdown("""
    ---

    **Tips:**
    - For best results with image-based PDFs, ensure good image quality
    - Large PDFs may take longer to process
    - OCR works best with clear, high-contrast text
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True
    )