import pdfplumber

# Note: The Gemini call logic is now centralized in gemini_utils.py
# This file is now only for PDF-specific processing.


def preprocess_pdf_text(pdf_path: str) -> str:
    """Extract the text of every page of a PDF using pdfplumber.

    Each page is extracted with ``layout=True`` (attempting to preserve the
    page layout for better LLM understanding) and prefixed with a
    ``--- PAGE <n> ---`` header, where ``<n>`` is the 1-based page number.
    Pages for which pdfplumber yields no text get a placeholder line instead.

    Args:
        pdf_path: Path of the PDF file handed to ``pdfplumber.open``.

    Returns:
        The concatenated per-page text. If any exception is raised while
        opening or reading the PDF, the error is printed and the error
        message itself is returned as the string; nothing is raised to the
        caller.
    """
    processed_text_parts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text(
                    x_tolerance=2, y_tolerance=2, layout=True
                )
                if not page_text:
                    # Keep the page header even for empty pages so page
                    # numbering in the output stays contiguous.
                    page_text = "[No text extracted from this page]"
                processed_text_parts.append(
                    f"\n--- PAGE {page_number} ---\n{page_text}"
                )
        return "".join(processed_text_parts)
    except Exception as e:
        # Deliberate best-effort: report the failure as the returned text
        # rather than raising, so the caller always receives a string.
        error_msg = f"Error processing PDF with pdfplumber: {type(e).__name__} - {e}"
        print(error_msg)
        return error_msg


# We are keeping the get_structured_data_with_gemini call in the main app flow
# but importing it from gemini_utils to keep API calls together.
from gemini_utils import get_structured_data_with_gemini