# pdf_processing.py
import os
import traceback
from collections import Counter, defaultdict
from typing import Any, Dict, List

import fitz  # PyMuPDF
import pymupdf4llm


def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
    """Convert a fitz.Rect into a plain dictionary.

    Args:
        rect: The rectangle to convert.

    Returns:
        A dict with the keys "x0", "y0", "x1", "y1", "width" and "height",
        or None when *rect* is falsy or is not a fitz.Rect instance.
    """
    if not rect or not isinstance(rect, fitz.Rect):
        return None
    return {
        "x0": rect.x0,
        "y0": rect.y0,
        "x1": rect.x1,
        "y1": rect.y1,
        "width": rect.width,
        "height": rect.height,
    }


def try_map_issues_to_page_rects(
    issues_to_map_for_context: List[Dict[str, Any]],
    pdf_rects: List[fitz.Rect],
    page_number_for_mapping: int,
) -> int:
    """Pair issues with rectangles by position and record the coordinates.

    The i-th issue is paired with the i-th rectangle; pairing stops at the
    end of the shorter list. Issues that are already flagged as mapped are
    skipped (their positional rectangle is not reused for another issue).
    Each newly mapped issue dict is updated in place with the keys
    'pdf_coordinates_list', 'is_mapped_to_pdf' and 'mapped_page_number'.

    Args:
        issues_to_map_for_context: Issue dicts to update in place.
        pdf_rects: Rectangles found on the page, in matching order.
        page_number_for_mapping: Page number stored on each mapped issue.

    Returns:
        The number of issues that were mapped by this call.
    """
    mapped_count = 0
    for issue, pdf_rect in zip(issues_to_map_for_context, pdf_rects):
        # .get(): an issue dict without the flag is treated as "not mapped
        # yet" instead of aborting the whole mapping with a KeyError.
        if issue.get('is_mapped_to_pdf'):
            continue
        coord_dict = convert_rect_to_dict(pdf_rect)
        if not coord_dict:
            continue
        issue['pdf_coordinates_list'] = [coord_dict]
        issue['is_mapped_to_pdf'] = True
        issue['mapped_page_number'] = page_number_for_mapping
        mapped_count += 1
    return mapped_count


def extract_font_filtered_markdown(pdf_path: str) -> str:
    """Return Markdown built only from the text set in the PDF's majority font.

    The majority font is the (font name, rounded font size) pair that covers
    the most characters across the whole document. Spans in that font are
    re-inserted, at their original bounding boxes, into a new in-memory PDF
    with the same page sizes, and that PDF is converted to Markdown with
    PyMuPDF4LLM.

    Args:
        pdf_path: Path to the PDF file to process.

    Returns:
        The Markdown text, or "" when the PDF has no pages, contains no text
        spans, or any error occurs (the error and traceback are printed).
    """
    original_doc = None
    new_doc = None
    try:
        original_doc = fitz.open(pdf_path)
        if not original_doc.page_count:
            print("FontFilter: PDF has no pages.")
            return ""

        # Spans are grouped by page while scanning so the rebuild step below
        # does not have to rescan every span of the document for every page.
        spans_by_page: Dict[int, List[Dict[str, Any]]] = defaultdict(list)
        font_char_counts: Counter = Counter()
        pdf_basename = os.path.basename(pdf_path)
        print(f"FontFilter: Analyzing fonts in '{pdf_basename}' ({original_doc.page_count} pages)...")

        for page_num in range(original_doc.page_count):
            text_dict = original_doc[page_num].get_text("dict")
            for block in text_dict.get("blocks", []):
                if block.get("type") != 0:  # only type-0 blocks are processed
                    continue
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        font_name = span["font"]
                        font_size_rounded = int(round(span["size"]))
                        text = span["text"]
                        bbox = span["bbox"]
                        spans_by_page[page_num].append({
                            "text": text,
                            "font_name": font_name,
                            "font_size_rounded": font_size_rounded,
                            "original_font_size": span["size"],
                            "bbox": bbox,
                            # Insertion point for the fallback below; when the
                            # span has no "origin", use the bbox bottom-left.
                            "origin": span.get("origin") or (bbox[0], bbox[3]),
                            "page_num": page_num,
                        })
                        font_char_counts[(font_name, font_size_rounded)] += len(text)

        if not font_char_counts:
            print("FontFilter: No text with font information found in PDF.")
            return ""

        majority_font_key, char_count = font_char_counts.most_common(1)[0]
        majority_font_name, majority_font_size_rounded = majority_font_key
        print(f"FontFilter: Majority font: Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt ({char_count} chars).")

        new_doc = fitz.Document()
        for p_num in range(original_doc.page_count):
            source_rect = original_doc[p_num].rect
            new_pdf_page = new_doc.new_page(width=source_rect.width, height=source_rect.height)

            for span_data in spans_by_page.get(p_num, []):
                if (span_data["font_name"] != majority_font_name
                        or span_data["font_size_rounded"] != majority_font_size_rounded):
                    continue
                text_to_insert = span_data["text"]
                font_size_for_render = span_data["original_font_size"]
                unused_space = new_pdf_page.insert_textbox(
                    fitz.Rect(span_data["bbox"]),
                    text_to_insert,
                    fontsize=font_size_for_render,
                    fontname="helv",
                    align=0,
                )
                if unused_space < 0:
                    # A negative result means insert_textbox wrote nothing
                    # because the text does not fit the original bbox when
                    # rendered in Helvetica. Place it at the span origin
                    # instead of silently dropping it from the output.
                    new_pdf_page.insert_text(
                        span_data["origin"],
                        text_to_insert,
                        fontsize=font_size_for_render,
                        fontname="helv",
                    )

        markdown_text = ""
        if new_doc.page_count > 0:
            markdown_text = pymupdf4llm.to_markdown(new_doc)
        else:
            print("FontFilter: The new PDF (filtered) is empty. No markdown generated.")
        return markdown_text
    except Exception as e:
        print(f"Error in extract_font_filtered_markdown for '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        # Compare against None rather than testing truthiness: a Document
        # reports its page count as its length, so an empty document is
        # falsy and would never be closed.
        if original_doc is not None:
            original_doc.close()
        if new_doc is not None:
            new_doc.close()


def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
    """Return the unfiltered plain text of every page, concatenated in order.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The text of all pages joined without a separator, or "" when any
        error occurs (the error and traceback are printed).
    """
    doc_orig_text = None
    try:
        doc_orig_text = fitz.open(pdf_path)
        return "".join(page.get_text("text") for page in doc_orig_text)
    except Exception as e:
        print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        # "is not None" so that a zero-page (falsy) document is still closed.
        if doc_orig_text is not None:
            doc_orig_text.close()