# DAR_standardiser / dar_processor.py
# Author: rameshmoorthy
# Commit: "Create dar_processor.py" (8e2a4c2, verified)
import pdfplumber
# Note: The Gemini call logic is now centralized in gemini_utils.py
# This file is now only for PDF-specific processing.
def preprocess_pdf_text(pdf_path: str) -> str:
    """
    Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages are read with pdfplumber using ``layout=True`` so the extracted
    text keeps an approximation of the page layout. Each page's text is
    preceded by a ``--- PAGE n ---`` marker (numbered from 1); a page that
    yields no text gets a placeholder line after its marker instead.

    Any exception raised while opening or reading the PDF is caught: the
    error message is printed and returned in place of the extracted text.
    """
    empty_page_note = "[No text extracted from this page]"
    try:
        with pdfplumber.open(pdf_path) as document:
            sections = []
            for page_number, page in enumerate(document.pages, start=1):
                content = page.extract_text(x_tolerance=2, y_tolerance=2, layout=True)
                # Falsy results (None or "") are replaced by the placeholder.
                sections.append(
                    f"\n--- PAGE {page_number} ---\n{content or empty_page_note}"
                )
            return "".join(sections)
    except Exception as exc:
        failure = f"Error processing PDF with pdfplumber: {type(exc).__name__} - {exc}"
        print(failure)
        return failure
# get_structured_data_with_gemini is still called from the main app flow;
# it is imported here from gemini_utils so that all Gemini API calls live together.
from gemini_utils import get_structured_data_with_gemini