|
|
|
import fitz |
|
import pymupdf4llm |
|
import os |
|
import tempfile |
|
import traceback |
|
from typing import Tuple, Optional, List, Dict, Any |
|
|
|
def convert_rect_to_dict(rect: fitz.Rect) -> Optional[Dict[str, float]]:
    """Turn a ``fitz.Rect`` into a plain dictionary of its geometry.

    Args:
        rect: The rectangle to convert.

    Returns:
        A dict with the keys ``x0``, ``y0``, ``x1``, ``y1``, ``width`` and
        ``height`` read from ``rect``, or ``None`` (after printing a warning)
        when ``rect`` is falsy or is not a ``fitz.Rect`` instance.
    """
    # Truthiness is tested before the type check, so falsy values of any
    # type are rejected first.
    usable = bool(rect) and isinstance(rect, fitz.Rect)
    if not usable:
        print(f"Warning: Invalid rect object received: {rect}")
        return None

    geometry_fields = ("x0", "y0", "x1", "y1", "width", "height")
    return {field: getattr(rect, field) for field in geometry_fields}
|
|
|
def try_map_issues_to_page_rects(
    issues_to_map_for_context: List[Dict[str, Any]],
    pdf_rects: List[fitz.Rect],
    page_number_for_mapping: int
) -> int:
    """Attach PDF rectangles to issue dicts by position and count the hits.

    The i-th issue is paired with the i-th rectangle; pairing stops at the
    end of the shorter list. Issues whose ``'is_mapped_to_pdf'`` flag is
    already truthy are left untouched (their paired rectangle is not reused
    for another issue).

    Args:
        issues_to_map_for_context: Issue dicts, updated in place. Each is
            expected to carry ``'is_mapped_to_pdf'`` and ``'context_text'``.
        pdf_rects: Candidate rectangles, in the same order as the issues.
        page_number_for_mapping: Page number stored on every issue that gets
            mapped by this call.

    Returns:
        How many issues were newly mapped by this call.
    """
    newly_mapped = 0

    # zip() stops at the shorter sequence, giving the same min-length limit.
    for issue, rect in zip(issues_to_map_for_context, pdf_rects):
        if issue['is_mapped_to_pdf']:
            continue

        coords = convert_rect_to_dict(rect)
        if not coords:
            print(f" Warning: Could not convert rect for context '{issue['context_text'][:30]}...' on page {page_number_for_mapping}")
            continue

        issue['pdf_coordinates_list'] = [coords]
        issue['is_mapped_to_pdf'] = True
        issue['mapped_page_number'] = page_number_for_mapping
        newly_mapped += 1

    return newly_mapped
|
|
|
def extract_pdf_text(file_input: Any) -> str:
    """Extract the full text of a PDF as Markdown using pymupdf4llm.

    Args:
        file_input: Either a filesystem path (``str``) to a PDF, or a
            file-like object with callable ``read()`` (``seek(0)`` is also
            called on it). File-like input is rewound and copied into a
            temporary ``.pdf`` file so it can be handed to PyMuPDF by path.

    Returns:
        The Markdown text returned by ``pymupdf4llm.to_markdown``, or ``""``
        when anything fails (unsupported input type, unreadable stream,
        extraction error). Failures are printed together with a traceback
        instead of being raised.
    """
    temp_file_path_for_pymupdf4llm = None
    try:
        if isinstance(file_input, str):
            actual_path_to_process = file_input
        elif hasattr(file_input, 'read') and callable(file_input.read):
            # The context manager closes the handle even if seek()/read()
            # raises; delete=False keeps the file on disk for the by-path
            # processing below (it is removed in the finally clause).
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
                temp_file_path_for_pymupdf4llm = temp_file_obj.name
                file_input.seek(0)
                temp_file_obj.write(file_input.read())
            actual_path_to_process = temp_file_path_for_pymupdf4llm
        else:
            raise ValueError("Input 'file_input' must be a file path (str) or a file-like object.")

        # Open the document only long enough to report the page count; the
        # with-block guarantees it is closed even if len() fails.
        with fitz.open(actual_path_to_process) as doc_for_page_count:
            page_count = len(doc_for_page_count)
        print(f"PDF has {page_count} pages. Extracting Markdown using pymupdf4llm.")

        markdown_text = pymupdf4llm.to_markdown(actual_path_to_process)

        print(f"Total extracted Markdown text length: {len(markdown_text)} characters.")
        return markdown_text

    except Exception as e:
        # Deliberate best-effort boundary: report the problem and fall back
        # to an empty string rather than propagating.
        print(f"Error extracting text from PDF: {str(e)}")
        traceback.print_exc()
        return ""
    finally:
        if temp_file_path_for_pymupdf4llm is not None:
            # Cleanup must never replace the function's result with a new
            # exception, so removal failures are only reported.
            try:
                os.remove(temp_file_path_for_pymupdf4llm)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                print(f"Warning: Could not remove temporary file {temp_file_path_for_pymupdf4llm}: {cleanup_error}")