# pdf_processing.py
"""Helpers for pulling text and rectangle coordinates out of PDFs via PyMuPDF."""
import os
import traceback
from collections import Counter
from typing import Any, Dict, List  # Use standard List, Dict

import fitz  # PyMuPDF
import pymupdf4llm


def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
    """Convert a ``fitz.Rect`` into a plain dictionary.

    Args:
        rect: The rectangle to convert.

    Returns:
        A dict with the keys ``x0``, ``y0``, ``x1``, ``y1``, ``width`` and
        ``height``, or ``None`` when ``rect`` is falsy or is not a
        ``fitz.Rect`` instance.
    """
    if not rect or not isinstance(rect, fitz.Rect):
        return None
    return {
        "x0": rect.x0,
        "y0": rect.y0,
        "x1": rect.x1,
        "y1": rect.y1,
        "width": rect.width,
        "height": rect.height,
    }


def try_map_issues_to_page_rects(
    issues_to_map_for_context: List[Dict[str, Any]],
    pdf_rects: List[fitz.Rect],
    page_number_for_mapping: int,
) -> int:
    """Attach PDF rectangles to issues by position and mark them as mapped.

    The i-th issue is paired with the i-th rectangle; pairing stops at the
    end of the shorter list. Issues already flagged as mapped are skipped,
    and the rectangle at that index is not reused for another issue.

    Args:
        issues_to_map_for_context: Issue dicts, updated in place with the keys
            ``pdf_coordinates_list``, ``is_mapped_to_pdf`` and
            ``mapped_page_number``.
        pdf_rects: Rectangles found on the page, in the order to pair them.
        page_number_for_mapping: Page number stored on each newly mapped issue.

    Returns:
        The number of issues that were newly mapped by this call.
    """
    mapped_count = 0
    for issue_to_update, pdf_rect in zip(issues_to_map_for_context, pdf_rects):
        # .get(): an issue that never had the flag set counts as unmapped
        # instead of aborting the whole mapping with a KeyError.
        if issue_to_update.get('is_mapped_to_pdf'):
            continue
        coord_dict = convert_rect_to_dict(pdf_rect)
        if coord_dict:
            issue_to_update['pdf_coordinates_list'] = [coord_dict]
            issue_to_update['is_mapped_to_pdf'] = True
            issue_to_update['mapped_page_number'] = page_number_for_mapping
            mapped_count += 1
    return mapped_count


def _iter_text_spans(text_dict: Dict[str, Any]):
    """Yield every span of every text block (``type == 0``) in a "dict" extraction."""
    for block in text_dict.get("blocks", []):
        if block.get("type") != 0:
            continue
        for line in block.get("lines", []):
            yield from line.get("spans", [])


def _span_font_key(span: Dict[str, Any]) -> tuple[str, int] | None:
    """Return ``(font name, size rounded to int)`` for a span, or ``None`` if it has no size."""
    size = span.get("size")
    if size is None:
        return None
    return span["font"], int(round(size))


def _collect_block_texts(
    text_dict: Dict[str, Any], font_key: tuple[str, int]
) -> List[str]:
    """Return one string per text block, built only from spans matching ``font_key``.

    Matching spans of a line are joined with a single space and the lines of
    a block with a newline. Lines and blocks with no matching span are dropped.
    """
    block_texts: List[str] = []
    for block in text_dict.get("blocks", []):
        if block.get("type") != 0:  # only text blocks carry lines/spans
            continue
        line_texts = []
        for line in block.get("lines", []):
            matching = [
                span["text"]
                for span in line.get("spans", [])
                if _span_font_key(span) == font_key
            ]
            if matching:
                line_texts.append(" ".join(matching))
        if line_texts:
            block_texts.append("\n".join(line_texts))
    return block_texts


def extract_majority_font_text_directly(pdf_path: str) -> str:
    """Extract only the text set in the PDF's most-used font.

    A first pass counts characters per ``(font name, rounded size)`` pair,
    ignoring whitespace-only spans, and picks the pair with the most
    characters. A second pass assembles a plain-text string from the spans
    that match that pair: spans are joined by a space, lines by a newline,
    and blocks and pages by a blank line. No intermediate PDF is created.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The assembled text. An empty string is returned when the PDF has no
        pages, when no span carries font information, when nothing matches
        the majority font, or when any exception occurs (the error and
        traceback are printed rather than raised).
    """
    original_doc = None
    try:
        # 1. Open the PDF and analyse its fonts.
        original_doc = fitz.open(pdf_path)
        if not original_doc.page_count:
            print("FontFilter (Direct): PDF has no pages.")
            return ""

        font_char_counts: Counter = Counter()
        pdf_basename = os.path.basename(pdf_path)
        print(f"FontFilter (Direct): Analyzing fonts in '{pdf_basename}' ({original_doc.page_count} pages)...")

        for page_analysis in original_doc:
            text_dict_analysis = page_analysis.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)
            for span_analysis in _iter_text_spans(text_dict_analysis):
                font_key = _span_font_key(span_analysis)
                if font_key is None:
                    continue  # no size available for this span
                text = span_analysis["text"]
                if not text.strip():
                    continue  # whitespace-only spans must not sway the vote
                font_char_counts[font_key] += len(text)

        if not font_char_counts:
            print("FontFilter (Direct): No text with font information found in PDF.")
            return ""

        majority_key, char_count_for_majority = font_char_counts.most_common(1)[0]
        majority_font_name, majority_font_size_rounded = majority_key
        print(
            f"FontFilter (Direct): Majority font identified: Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt ({char_count_for_majority} chars).")

        # 2. Second pass: keep only the spans set in the majority font.
        print(
            f"FontFilter (Direct): Extracting text matching majority font (Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt)...")

        all_pages_collected_text = []
        for page in original_doc:
            # NOTE(review): the two extra flags look redundant with
            # TEXTFLAGS_TEXT; kept as in the original. Confirm against the
            # PyMuPDF flag definitions before simplifying.
            text_page_dict = page.get_text(
                "dict",
                flags=fitz.TEXTFLAGS_TEXT | fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE,
            )
            page_blocks_text_parts = _collect_block_texts(text_page_dict, majority_key)
            if page_blocks_text_parts:
                # Blank line between blocks approximates paragraph breaks.
                all_pages_collected_text.append("\n\n".join(page_blocks_text_parts))

        if not all_pages_collected_text:
            print("FontFilter (Direct): No text matching the majority font was found to extract.")
            return ""

        # Pages use the same blank-line separator as blocks; add a custom
        # separator here if page boundaries must stay distinguishable.
        final_text = "\n\n".join(all_pages_collected_text)
        print(f"FontFilter (Direct): Successfully extracted text. Total length: {len(final_text)} characters.")
        return final_text

    except Exception as e:
        print(f"Error in extract_majority_font_text_directly for '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        # Compare against None rather than testing truthiness: a document
        # with zero pages has len() == 0 and would otherwise never be closed.
        if original_doc is not None:
            original_doc.close()


def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
    """Extract the raw plain text of every page, without any filtering.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order with no separator,
        or an empty string if any exception occurs (the error and traceback
        are printed rather than raised).
    """
    doc_orig_text = None
    try:
        doc_orig_text = fitz.open(pdf_path)
        return "".join(page.get_text("text") for page in doc_orig_text)
    except Exception as e:
        print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        # See extract_majority_font_text_directly: truthiness would skip
        # closing a zero-page document.
        if doc_orig_text is not None:
            doc_orig_text.close()