File size: 7,051 Bytes
625982b
c1ce0f7
2d31420
 
 
 
 
c1ce0f7
2d31420
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
625982b
2d31420
 
 
625982b
 
2d31420
625982b
 
 
 
2d31420
 
 
 
625982b
 
 
2d31420
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
625982b
 
2d31420
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c1ce0f7
625982b
2d31420
625982b
2d31420
 
 
 
 
 
 
 
625982b
 
2d31420
625982b
2d31420
625982b
 
 
2d31420
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
625982b
2d31420
625982b
2d31420
 
 
 
 
625982b
 
2d31420
625982b
 
 
2d31420
625982b
 
2d31420
 
 
 
 
 
625982b
 
 
2d31420
625982b
2d31420
 
 
 
 
 
 
 
 
c1ce0f7
625982b
c1ce0f7
2d31420
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import gradio as gr
import fitz  # PyMuPDF
import requests
import os
import tempfile
import base64
from typing import Optional, Tuple

# Settings for the OCR.space service.
# The endpoint is fixed; the key is read from the environment and falls back
# to a placeholder string when the OCR_API_KEY variable is not set.
OCR_API_URL = 'https://api.ocr.space/parse/image'
OCR_API_KEY = os.environ.get('OCR_API_KEY', 'your_ocr_space_api_key_here')

def extract_text_with_ocr(pdf_file_path: str) -> str:
    """Extract text from the first page of a PDF using the OCR.space API.

    Only page 1 is processed: it is rendered to a PNG at 2x zoom with
    PyMuPDF, base64-encoded and posted to ``OCR_API_URL`` together with
    ``OCR_API_KEY``.

    Args:
        pdf_file_path: Path of the PDF file on disk.

    Returns:
        The text recognised by the service. This function never raises;
        failures are reported through the returned string instead:

        * ``"OCR Error: ..."`` - the service flagged a processing error.
        * ``"OCR API Error: <status>"`` - the HTTP status was not 200.
        * ``"OCR processing error: ..."`` - any exception raised locally
          (opening/rendering the PDF, the HTTP request, JSON decoding).
        * ``"No text extracted from OCR"`` - the response held no results.
    """
    try:
        # The context manager closes the document even when rendering fails,
        # so the file handle is not leaked on the error path.
        with fitz.open(pdf_file_path) as doc:
            page = doc[0]  # First page only
            zoom = fitz.Matrix(2.0, 2.0)  # 2x zoom for a higher-resolution image
            img_data = page.get_pixmap(matrix=zoom).tobytes("png")

        # The API takes the image inline as a base64 data URI
        img_base64 = base64.b64encode(img_data).decode('utf-8')
        payload = {
            'apikey': OCR_API_KEY,
            'language': 'eng',
            'isOverlayRequired': False,
            'base64Image': f'data:image/png;base64,{img_base64}',
            'iscreatesearchablepdf': False,
            'issearchablepdfhidetextlayer': False
        }

        response = requests.post(OCR_API_URL, data=payload, timeout=60)
        if response.status_code != 200:
            return f"OCR API Error: {response.status_code}"

        result = response.json()
        if result.get('IsErroredOnProcessing', False):
            error_message = result.get('ErrorMessage', 'Unknown error')
            # The service may report several messages as a list; join them
            # instead of showing a Python list repr to the user.
            if isinstance(error_message, (list, tuple)):
                error_message = "; ".join(str(part) for part in error_message)
            return f"OCR Error: {error_message or 'Unknown error'}"

        parsed_results = result.get('ParsedResults', [])
        if not parsed_results:
            return "No text extracted from OCR"
        return parsed_results[0].get('ParsedText', 'No text found')

    except Exception as e:
        # Deliberate catch-all: callers expect an error string, not an exception
        return f"OCR processing error: {str(e)}"

def extract_text_from_pdf(pdf_file) -> Tuple[str, str]:
    """Extract text from an uploaded PDF, falling back to OCR when needed.

    Text is first read page by page with PyMuPDF. If the combined text is
    50 characters or fewer, the PDF is treated as image-based and
    ``extract_text_with_ocr`` is tried, provided an OCR API key is configured.

    Args:
        pdf_file: The uploaded file. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object whose ``name`` attribute is the
            path. ``None`` means nothing was uploaded.

    Returns:
        A ``(text, status)`` tuple. ``text`` is the extracted text or a
        user-facing error/help message; ``status`` is a short status label.
        Exceptions are converted into an error message, never raised.
    """
    if pdf_file is None:
        return "No file uploaded", "❌ Error"

    status = "✅ Success"

    try:
        # A path string has no `.name`, so resolve both input shapes
        # explicitly instead of assuming a file-like wrapper.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name

        # Primary method: PyMuPDF text extraction. The context manager closes
        # the document even if a page fails to parse.
        page_sections = []
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc):
                page_text = page.get_text("text")
                if page_text.strip():
                    page_sections.append(
                        f"\n--- Page {page_num + 1} ---\n{page_text}\n"
                    )
        text = "".join(page_sections).strip()

        # If we got meaningful text, return it
        if len(text) > 50:  # Arbitrary threshold
            return text, status

        # Little or no text: treat as an image-based PDF and try OCR
        status = "⚠️ Using OCR (Image-based PDF detected)"

        # The placeholder value means no real API key was configured
        if OCR_API_KEY == 'your_ocr_space_api_key_here':
            return ("No extractable text found. This appears to be an image-based PDF.\n"
                   "To extract text from image-based PDFs, please:\n"
                   "1. Get a free API key from https://ocr.space/ocrapi\n"
                   "2. Set the OCR_API_KEY environment variable\n"
                   "3. Restart the application"), "❌ OCR Not Configured"

        ocr_text = extract_text_with_ocr(pdf_path)

        # Every failure prefix the OCR helper can return, including HTTP-level
        # errors, must be reported as a failure rather than as extracted text.
        ocr_failure_prefixes = ("OCR Error:", "OCR API Error:", "OCR processing error:")
        if ocr_text.startswith(ocr_failure_prefixes):
            return f"Primary extraction failed, OCR fallback error:\n{ocr_text}", "❌ OCR Failed"

        return f"Extracted using OCR:\n\n{ocr_text}", status

    except Exception as e:
        # Last-resort handler: translate known failures into friendlier text.
        # Matching is case-insensitive because message casing is not guaranteed.
        detail = str(e)
        lowered = detail.lower()

        if isinstance(e, FileNotFoundError) or "no such file" in lowered:
            error_msg = "File not found. Please try uploading the PDF again."
        elif "not a pdf" in lowered:
            error_msg = "Invalid file format. Please upload a valid PDF file."
        elif "encrypted" in lowered:
            error_msg = "This PDF is password-protected. Please provide an unlocked PDF."
        elif "corrupted" in lowered:
            error_msg = "This PDF file appears to be corrupted. Please try a different file."
        else:
            error_msg = f"Error processing PDF: {detail}"

        return error_msg, "❌ Error"

def clear_output():
    """Return the reset values for the UI outputs.

    Returns:
        A ``(text, status)`` tuple: an empty string for the text output and
        the ready label for the status field.
    """
    # The status literal was mis-encoded (mojibake); use the real emoji
    return "", "🔄 Ready"

# Create the Gradio interface.
# Layout: a narrow left column (upload, buttons, status, OCR hint) and a wider
# right column holding the extracted text. Emoji in labels were previously
# mis-encoded (mojibake) and are restored here.
with gr.Blocks(title="PDF Text Extraction App", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 PDF Text Extraction App")
    gr.Markdown("""
    Upload a PDF file to extract its text content. 
    
    **Features:**
    - ✅ Direct text extraction from text-based PDFs
    - 🔍 OCR fallback for image-based PDFs (requires OCR.space API key)
    - 📊 Status indicators for extraction method used
    """)
    
    with gr.Row():
        with gr.Column(scale=1):
            # type="filepath": handlers receive the upload as a path
            pdf_input = gr.File(
                label="📎 Upload PDF File",
                file_types=[".pdf"],
                type="filepath"
            )
            
            with gr.Row():
                extract_btn = gr.Button("🔍 Extract Text", variant="primary", size="lg")
                clear_btn = gr.Button("🗑️ Clear", variant="secondary")
            
            # Status indicator (read-only, single line)
            status_output = gr.Textbox(
                label="Status",
                value="🔄 Ready",
                interactive=False,
                max_lines=1
            )
            
            # OCR Configuration info
            gr.Markdown("""
            **OCR Configuration:**
            Set `OCR_API_KEY` environment variable for image-based PDF support.
            Get free API key at: https://ocr.space/ocrapi
            """)
        
        with gr.Column(scale=2):
            text_output = gr.Textbox(
                label="📝 Extracted Text",
                lines=25,
                max_lines=50,
                placeholder="Extracted text will appear here...",
                show_copy_button=True
            )
    
    # Event handlers: each writes to both the text box and the status field
    extract_btn.click(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=[text_output, status_output]
    )
    
    clear_btn.click(
        fn=clear_output,
        outputs=[text_output, status_output]
    )
    
    # Auto-extract when the file input changes
    pdf_input.change(
        fn=extract_text_from_pdf,
        inputs=pdf_input,
        outputs=[text_output, status_output]
    )
    
    # Footer
    gr.Markdown("""
    ---
    **Tips:**
    - For best results with image-based PDFs, ensure good image quality
    - Large PDFs may take longer to process
    - OCR works best with clear, high-contrast text
    """)

# Start the server only when this file is executed as a script
if __name__ == "__main__":
    # Listen on every interface at port 7860, no public share link, debug on
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    demo.launch(**launch_options)