# pdf_processing.py
import os
import tempfile
import traceback
from typing import Tuple, Optional, List, Dict, Any

import fitz  # PyMuPDF
import pymupdf4llm


def convert_rect_to_dict(rect: fitz.Rect) -> Optional[Dict[str, float]]:
    """Convert a fitz.Rect object to a plain dictionary.

    Args:
        rect: The rectangle to convert.

    Returns:
        A dict with keys "x0", "y0", "x1", "y1", "width" and "height", or
        None (after printing a warning) when ``rect`` is falsy or is not a
        ``fitz.Rect`` instance.
    """
    if not rect or not isinstance(rect, fitz.Rect):
        print(f"Warning: Invalid rect object received: {rect}")
        return None
    return {
        "x0": rect.x0,
        "y0": rect.y0,
        "x1": rect.x1,
        "y1": rect.y1,
        "width": rect.width,
        "height": rect.height,
    }


def try_map_issues_to_page_rects(
    issues_to_map_for_context: List[Dict[str, Any]],
    pdf_rects: List[fitz.Rect],
    page_number_for_mapping: int,  # 1-based page number
) -> int:
    """Pair issues with PDF rectangles positionally and record the mapping.

    The i-th issue is paired with the i-th rectangle; pairing stops at the
    shorter of the two lists. Issues whose ``'is_mapped_to_pdf'`` flag is
    already truthy are skipped (their positional rectangle is not reused).
    A missing flag is treated as "not mapped yet".

    Each successfully mapped issue dict is mutated in place:
    ``'pdf_coordinates_list'`` is set to a one-element list holding the
    rectangle as a dict, ``'is_mapped_to_pdf'`` is set to True and
    ``'mapped_page_number'`` is set to ``page_number_for_mapping``.

    Args:
        issues_to_map_for_context: Issue dicts to update in place.
        pdf_rects: Candidate rectangles, positionally aligned with the issues.
        page_number_for_mapping: 1-based page number stored on mapped issues.

    Returns:
        The number of issues newly mapped by this call.
    """
    mapped_count = 0
    # zip() stops at the shorter sequence, matching the min(len, len) bound.
    for issue, pdf_rect in zip(issues_to_map_for_context, pdf_rects):
        if issue.get('is_mapped_to_pdf'):
            continue
        coord_dict = convert_rect_to_dict(pdf_rect)
        if coord_dict:
            issue['pdf_coordinates_list'] = [coord_dict]  # Store as list of dicts
            issue['is_mapped_to_pdf'] = True
            issue['mapped_page_number'] = page_number_for_mapping
            mapped_count += 1
        else:
            # The warning path must not itself fail when the context is absent.
            context_preview = str(issue.get('context_text', ''))[:30]
            print(
                f" Warning: Could not convert rect for context "
                f"'{context_preview}...' on page {page_number_for_mapping}"
            )
    return mapped_count


def extract_pdf_text(file_input: Any) -> str:
    """Extract the full text of a PDF as Markdown using PyMuPDF4LLM.

    Args:
        file_input: Either a file path (str) or a file-like object exposing
            a callable ``read`` (and ``seek``). File-like input is copied to
            a temporary ``.pdf`` file, which is removed before returning.

    Returns:
        The Markdown text produced by ``pymupdf4llm.to_markdown``, or an
        empty string if anything goes wrong (the error and traceback are
        printed rather than raised).
    """
    temp_file_path_for_pymupdf4llm = None
    try:
        if isinstance(file_input, str):
            actual_path_to_process = file_input
        elif hasattr(file_input, 'read') and callable(file_input.read):
            # delete=False: the file must outlive this handle so it can be
            # reopened by path below; it is removed in the finally clause.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
                temp_file_path_for_pymupdf4llm = temp_file_obj.name
                file_input.seek(0)
                temp_file_obj.write(file_input.read())
            actual_path_to_process = temp_file_path_for_pymupdf4llm
        else:
            raise ValueError("Input 'file_input' must be a file path (str) or a file-like object.")

        # Open once just to report the page count; always release the handle.
        doc_for_page_count = fitz.open(actual_path_to_process)
        try:
            page_count = len(doc_for_page_count)
        finally:
            doc_for_page_count.close()
        print(f"PDF has {page_count} pages. Extracting Markdown using pymupdf4llm.")

        markdown_text = pymupdf4llm.to_markdown(actual_path_to_process)
        print(f"Total extracted Markdown text length: {len(markdown_text)} characters.")
        return markdown_text
    except Exception as e:
        print(f"Error extracting text from PDF: {str(e)}")
        traceback.print_exc()
        return ""
    finally:
        if temp_file_path_for_pymupdf4llm and os.path.exists(temp_file_path_for_pymupdf4llm):
            try:
                os.remove(temp_file_path_for_pymupdf4llm)
            except OSError as cleanup_error:
                # A failed cleanup must not replace the function's result.
                print(
                    f"Warning: Could not remove temporary file "
                    f"{temp_file_path_for_pymupdf4llm}: {cleanup_error}"
                )