Spaces:
Sleeping
Sleeping
import pdfplumber | |
# Note: The Gemini call logic is now centralized in gemini_utils.py.
# This file is now only for PDF-specific processing.
def preprocess_pdf_text(pdf_path: str) -> str:
    """
    Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages are read with pdfplumber using ``layout=True`` so the extracted
    text keeps the page layout as far as possible for the downstream LLM.
    Each page is introduced by a ``--- PAGE n ---`` marker (numbered from 1);
    a page that yields no text gets a placeholder line after its marker.

    If anything raises while opening or reading the PDF, the error
    description is printed and returned in place of the document text.
    """
    try:
        with pdfplumber.open(pdf_path) as document:
            sections = []
            for page_number, page in enumerate(document.pages, start=1):
                extracted = page.extract_text(x_tolerance=2, y_tolerance=2, layout=True)
                # A falsy result (None or empty string) still gets its page
                # marker, followed by a placeholder instead of the text.
                body = extracted if extracted else "[No text extracted from this page]"
                sections.append(f"\n--- PAGE {page_number} ---\n{body}")
            return "".join(sections)
    except Exception as exc:
        # The failure is reported through the return value rather than raised.
        failure = f"Error processing PDF with pdfplumber: {type(exc).__name__} - {exc}"
        print(failure)
        return failure
# The get_structured_data_with_gemini call stays in the main app flow, but the
# function is imported from gemini_utils to keep the API calls together.
from gemini_utils import get_structured_data_with_gemini |