File size: 8,387 Bytes
961b876 0918bea eb20090 961b876 0918bea 961b876 0918bea 961b876 0918bea 961b876 0918bea 961b876 0918bea 961b876 0918bea 961b876 0918bea 961b876 2c6cadb eb20090 2c6cadb eb20090 961b876 2c6cadb 0918bea eb20090 2c6cadb eb20090 0918bea 2c6cadb eb20090 2c6cadb eb20090 2c6cadb eb20090 2c6cadb 0918bea 2c6cadb 0918bea 2c6cadb eb20090 0918bea 2c6cadb 0918bea 961b876 0918bea 961b876 0918bea |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
# pdf_processing.py
import fitz # PyMuPDF
import pymupdf4llm
import os
import traceback
from typing import Any, Dict, List # Use standard List, Dict
from collections import Counter
def convert_rect_to_dict(rect: fitz.Rect) -> Dict[str, float] | None:
    """Return the corners and dimensions of a ``fitz.Rect`` as a plain dict.

    The result has the keys ``x0``, ``y0``, ``x1``, ``y1``, ``width`` and
    ``height`` (in that order), each read from the attribute of the same name.

    Returns ``None`` when ``rect`` is falsy or is not a ``fitz.Rect`` instance.
    """
    # Truthiness is tested before the type check, as in the original guard.
    if rect and isinstance(rect, fitz.Rect):
        exported_attrs = ("x0", "y0", "x1", "y1", "width", "height")
        return {attr: getattr(rect, attr) for attr in exported_attrs}
    return None
def try_map_issues_to_page_rects(
    issues_to_map_for_context: List[Dict[str, Any]],
    pdf_rects: List[fitz.Rect],
    page_number_for_mapping: int
) -> int:
    """Attach PDF rectangles to issues by position and count the new mappings.

    The i-th issue is paired with the i-th rectangle; pairing stops at the end
    of the shorter list. An issue whose ``'is_mapped_to_pdf'`` entry is truthy
    is left untouched, and the rectangle at its position is not reused for
    another issue. For every other pair whose rectangle converts successfully
    via ``convert_rect_to_dict``, the issue dict is updated in place:

    * ``'pdf_coordinates_list'`` becomes a one-element list holding the
      converted rectangle,
    * ``'is_mapped_to_pdf'`` becomes ``True``,
    * ``'mapped_page_number'`` becomes ``page_number_for_mapping``.

    Returns the number of issues that were mapped by this call.
    """
    newly_mapped = 0
    for issue, rect in zip(issues_to_map_for_context, pdf_rects):
        if issue['is_mapped_to_pdf']:
            continue
        coords = convert_rect_to_dict(rect)
        if not coords:
            # Rectangle could not be converted; leave this issue unmapped.
            continue
        issue['pdf_coordinates_list'] = [coords]
        issue['is_mapped_to_pdf'] = True
        issue['mapped_page_number'] = page_number_for_mapping
        newly_mapped += 1
    return newly_mapped
import fitz # PyMuPDF
import os
import traceback
from typing import Any, Dict, List
from collections import Counter
# Assuming your helper functions (convert_rect_to_dict, etc.) are present if needed elsewhere.
import fitz # PyMuPDF
import os
import traceback
from typing import Any, Dict, List # Use standard List, Dict
from collections import Counter
# Assuming your other helper functions (convert_rect_to_dict, etc.) are in the same scope if needed by other parts of your code.
def extract_majority_font_text_directly(pdf_path: str) -> str:
"""
Extracts text from PDF, identifies the majority font and size,
and then directly assembles a plain text string containing only the text
that matches this majority font, attempting to preserve basic structure.
This method does NOT create an intermediate PDF document.
"""
original_doc = None
try:
# 1. Open PDF and Perform Font Analysis (similar to before)
original_doc = fitz.open(pdf_path)
if not original_doc.page_count:
print("FontFilter (Direct): PDF has no pages.")
return ""
font_char_counts: Counter = Counter()
pdf_basename = os.path.basename(pdf_path)
print(f"FontFilter (Direct): Analyzing fonts in '{pdf_basename}' ({original_doc.page_count} pages)...")
# First pass: Analyze fonts to find the majority
for page_num_analysis in range(original_doc.page_count):
page_analysis = original_doc[page_num_analysis]
# Using TEXTFLAGS_TEXT for potentially cleaner text from spans
text_dict_analysis = page_analysis.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)
for block_analysis in text_dict_analysis.get("blocks", []):
if block_analysis.get("type") == 0: # type 0 is a text block
for line_analysis in block_analysis.get("lines", []):
for span_analysis in line_analysis.get("spans", []):
font_name = span_analysis["font"]
font_size = span_analysis.get("size")
if font_size is None: continue # Skip if size is not available
font_size_rounded = int(round(font_size))
text = span_analysis["text"]
if not text.strip(): continue # Skip purely whitespace spans
font_char_counts[(font_name, font_size_rounded)] += len(text)
if not font_char_counts:
print("FontFilter (Direct): No text with font information found in PDF.")
return ""
majority_font_tuple_info = font_char_counts.most_common(1)[0]
(majority_font_name, majority_font_size_rounded) = majority_font_tuple_info[0]
char_count_for_majority = majority_font_tuple_info[1]
print(
f"FontFilter (Direct): Majority font identified: Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt ({char_count_for_majority} chars).")
# 2. Second Pass: Extract and Assemble Text Based on Majority Font
print(
f"FontFilter (Direct): Extracting text matching majority font (Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt)...")
all_pages_collected_text = [] # List to hold text from each page (as a list of block texts)
for page_num_extraction in range(original_doc.page_count):
page = original_doc[page_num_extraction]
# Using flags for potentially better whitespace and ligature handling in extracted text
text_page_dict = page.get_text("dict",
flags=fitz.TEXTFLAGS_TEXT | fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)
page_blocks_text_parts = [] # Collect text from blocks on this page
for block in text_page_dict.get("blocks", []):
if block.get("type") == 0: # Text block
current_block_lines_text_parts = []
for line in block.get("lines", []):
current_line_spans_text_parts = []
for span in line.get("spans", []):
# Check if this span matches the majority font
current_span_font_name = span["font"]
current_span_font_size = span.get("size")
if current_span_font_size is not None and \
current_span_font_name == majority_font_name and \
int(round(current_span_font_size)) == majority_font_size_rounded:
current_line_spans_text_parts.append(span["text"])
if current_line_spans_text_parts:
# Join text from selected spans within a line with a single space
line_text = " ".join(current_line_spans_text_parts)
current_block_lines_text_parts.append(line_text)
if current_block_lines_text_parts:
# Join lines within a block with a single newline
block_text = "\n".join(current_block_lines_text_parts)
page_blocks_text_parts.append(block_text)
if page_blocks_text_parts:
# Join blocks on a page with a double newline (simulating paragraph breaks)
all_pages_collected_text.append("\n\n".join(page_blocks_text_parts))
if not all_pages_collected_text:
print("FontFilter (Direct): No text matching the majority font was found to extract.")
return ""
# Join text from all pages.
# A page break is already handled by the \n\n between blocks of different pages.
# If more distinct page separation is needed, a custom separator could be added here.
final_text = "\n\n".join(all_pages_collected_text)
print(f"FontFilter (Direct): Successfully extracted text. Total length: {len(final_text)} characters.")
return final_text
except Exception as e:
print(f"Error in extract_majority_font_text_directly for '{pdf_path}': {e}\n{traceback.format_exc()}")
return ""
finally:
if original_doc: original_doc.close()
def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
    """Extract the raw plain text of every page of a PDF, without filtering.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The ``"text"`` extraction of each page, concatenated in page order
        with no separator added between pages. Returns ``""`` when any error
        occurs; the error is printed with a traceback and is not propagated
        to the caller.
    """
    doc_orig_text = None
    try:
        doc_orig_text = fitz.open(pdf_path)
        return "".join(page.get_text("text") for page in doc_orig_text)
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and return "".
        print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        # Compare against None rather than testing truthiness: a document
        # with zero pages has length 0 and would otherwise be left open.
        if doc_orig_text is not None:
            doc_orig_text.close()