Spaces:
Sleeping
Sleeping
File size: 1,248 Bytes
8e2a4c2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
import pdfplumber
# Note: The Gemini call logic is now centralized in gemini_utils.py
# This file is now only for PDF-specific processing.
def preprocess_pdf_text(pdf_path: str) -> str:
    """Extract the text of every page of a PDF into one annotated string.

    Each page is read with pdfplumber's ``extract_text`` using
    ``layout=True`` (an attempt to keep the visual layout so a downstream
    LLM can interpret it more easily) and is prefixed with a
    ``--- PAGE n ---`` marker, where ``n`` is 1-based. Pages that yield no
    text still get a marker followed by a placeholder line, so page
    numbering in the output stays continuous.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The concatenated page sections. This function does not raise for
        ordinary errors: if opening or reading the PDF fails, the error is
        printed and a message beginning with
        ``"Error processing PDF with pdfplumber:"`` is returned instead, so
        callers must inspect the returned string to detect failure.
    """
    processed_text_parts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text(x_tolerance=2, y_tolerance=2, layout=True)
                if page_text:
                    processed_text_parts.append(f"\n--- PAGE {page_number} ---\n{page_text}")
                else:
                    # Falsy result (None or empty string): keep the page
                    # marker so later pages are still numbered correctly.
                    processed_text_parts.append(
                        f"\n--- PAGE {page_number} ---\n[No text extracted from this page]"
                    )
        # Each part already starts with its own newline, so join with "".
        return "".join(processed_text_parts)
    except Exception as e:
        # Deliberate best-effort boundary: report the failure as text rather
        # than propagating, preserving the contract existing callers rely on.
        error_msg = f"Error processing PDF with pdfplumber: {type(e).__name__} - {e}"
        print(error_msg)
        return error_msg
# We are keeping the get_structured_data_with_gemini call in the main app flow
# but importing it from gemini_utils to keep API calls together.
from gemini_utils import get_structured_data_with_gemini |