File size: 1,248 Bytes
8e2a4c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import pdfplumber
# Note: The Gemini call logic is now centralized in gemini_utils.py
# This file is now only for PDF-specific processing.

def preprocess_pdf_text(pdf_path: str) -> str:
    """
    Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages are read with pdfplumber using ``layout=True`` so the extracted
    text keeps the page's spatial arrangement. Each page's text is preceded
    by a ``--- PAGE n ---`` banner (n counts from 1); a page that yields no
    text gets a placeholder line instead.

    Any exception raised while opening or reading the PDF is caught: the
    error description is printed and returned in place of the document text.
    """
    sections = []
    try:
        with pdfplumber.open(pdf_path) as document:
            for page_number, page in enumerate(document.pages, start=1):
                banner = f"\n--- PAGE {page_number} ---\n"
                body = page.extract_text(x_tolerance=2, y_tolerance=2, layout=True)
                # extract_text may give back nothing for image-only or blank
                # pages; keep the banner so page numbering stays continuous.
                sections.append(banner + (body or "[No text extracted from this page]"))
    except Exception as exc:
        failure = f"Error processing PDF with pdfplumber: {type(exc).__name__} - {exc}"
        print(failure)
        return failure
    else:
        return "".join(sections)

# We are keeping the get_structured_data_with_gemini call in the main app flow
# but importing it from gemini_utils to keep API calls together.
from gemini_utils import get_structured_data_with_gemini