|
|
|
import fitz |
|
import pymupdf4llm |
|
import os |
|
import traceback |
|
from typing import Any, Dict, List, Optional |
|
from collections import Counter |
|
|
|
def convert_rect_to_dict(rect: fitz.Rect) -> Optional[Dict[str, float]]:
    """Converts a fitz.Rect object to a dictionary.

    Args:
        rect: Rectangle to serialize.

    Returns:
        Dict with keys x0/y0/x1/y1/width/height, or None when rect is
        falsy (e.g. an empty rect or None) or not a fitz.Rect at all.
    """
    # Optional[...] matches the typing style used elsewhere in this file
    # (see _get_specific_error_rect_in_context) instead of `X | None`.
    if not rect or not isinstance(rect, fitz.Rect):
        return None
    return {
        "x0": rect.x0, "y0": rect.y0, "x1": rect.x1, "y1": rect.y1,
        "width": rect.width, "height": rect.height,
    }
|
|
|
def _get_specific_error_rect_in_context(
    page: fitz.Page,
    context_rect: fitz.Rect,
    error_text_verbatim: str
) -> Optional[fitz.Rect]:
    """
    Tries to find the precise bounding box of error_text_verbatim within
    the larger context_rect on the given page.

    Args:
        page: Page to search on.
        context_rect: Clip rectangle restricting the word search.
        error_text_verbatim: Exact error text whose bbox is wanted.

    Returns:
        The union rect of the first matching word sequence, or None when
        the text is empty/whitespace, no sequence matches, or the union
        rect is degenerate (empty).
    """
    if not error_text_verbatim or error_text_verbatim.isspace():
        # Plain string: original used an f-string with no placeholders.
        print("Debug: _get_specific_error_rect_in_context: error_text_verbatim is empty or whitespace.")
        return None

    # Word tuples are (x0, y0, x1, y1, text, block_no, line_no, word_no).
    words_on_page_in_clip = page.get_text("words", clip=context_rect, sort=True)

    error_tokens = error_text_verbatim.strip().split()
    if not error_tokens:
        print(f"Debug: _get_specific_error_rect_in_context: No tokens from error_text_verbatim '{error_text_verbatim}'.")
        return None

    # Slide a window over the in-clip words looking for the first
    # case-insensitive, whitespace-normalized match of the full token
    # sequence.
    for i in range(len(words_on_page_in_clip) - len(error_tokens) + 1):
        current_sequence_rects = []
        for j, error_token in enumerate(error_tokens):
            pdf_word = words_on_page_in_clip[i + j]
            if error_token.strip().lower() != pdf_word[4].strip().lower():
                break
            current_sequence_rects.append(fitz.Rect(pdf_word[:4]))
        else:
            # Full sequence matched: union all word rects into one bbox.
            final_error_bbox = fitz.Rect()
            for r_part in current_sequence_rects:
                final_error_bbox.include_rect(r_part)
            # As in the original, a degenerate (empty) union yields None;
            # only the first matching sequence is ever considered.
            if not final_error_bbox.is_empty:
                return final_error_bbox
            return None

    return None
|
|
|
|
|
def try_map_issues_to_page_rects(
    issues_to_map_for_context: List[Dict[str, Any]],
    pdf_rects_from_search: List[fitz.Rect],
    page_number_for_mapping: int,
    page: fitz.Page
) -> int:
    """Attach PDF coordinates to issues by pairing them with search rects.

    Issues and rects are paired positionally (extras on either side are
    ignored); issues already flagged as mapped are skipped. For
    LanguageTool issues, the context rect is narrowed to the exact
    offending words when they can be located on the page.

    Args:
        issues_to_map_for_context: Issue dicts to update in place.
        pdf_rects_from_search: Candidate rects, one per issue occurrence.
        page_number_for_mapping: Page number recorded on mapped issues.
        page: Page used to refine LanguageTool error locations.

    Returns:
        Count of issues newly marked as mapped.
    """
    mapped_count = 0
    # zip stops at the shorter list, which enforces the positional limit.
    for issue, context_rect in zip(issues_to_map_for_context, pdf_rects_from_search):
        if issue['is_mapped_to_pdf']:
            continue

        rect_to_use = context_rect

        # LanguageTool issues carry the verbatim error text; try to shrink
        # the context occurrence down to just those words.
        if issue.get('source_check_type') == 'LanguageTool':
            verbatim = issue.get('error_text_verbatim')
            if verbatim:
                narrowed = _get_specific_error_rect_in_context(
                    page, context_rect, verbatim
                )
                if narrowed:
                    rect_to_use = narrowed

        coord_dict = convert_rect_to_dict(rect_to_use)
        if coord_dict:
            issue['pdf_coordinates_list'] = [coord_dict]
            issue['is_mapped_to_pdf'] = True
            issue['mapped_page_number'] = page_number_for_mapping
            mapped_count += 1

    return mapped_count
|
|
|
|
|
|
|
|
|
def _count_chars_per_font(doc: fitz.Document) -> Counter:
    """Tally character counts keyed by (font_name, rounded_point_size).

    Only text blocks (type 0) are examined; spans without a size or with
    whitespace-only text are skipped.
    """
    counts: Counter = Counter()
    for page in doc:
        text_dict = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)
        for block in text_dict.get("blocks", []):
            if block.get("type") != 0:  # 0 == text block
                continue
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    font_size = span.get("size")
                    if font_size is None:
                        continue
                    text = span["text"]
                    if not text.strip():
                        continue
                    counts[(span["font"], int(round(font_size)))] += len(text)
    return counts


def _collect_majority_font_text(
    doc: fitz.Document, font_name: str, font_size_rounded: int
) -> List[str]:
    """Collect per-page text built only from spans in the majority font.

    Spans within a line are joined by spaces, lines within a block by
    newlines, and blocks by blank lines, loosely preserving structure.
    Pages with no matching text contribute nothing.
    """
    extraction_flags = (fitz.TEXTFLAGS_TEXT
                        | fitz.TEXT_PRESERVE_LIGATURES
                        | fitz.TEXT_PRESERVE_WHITESPACE)
    pages_text: List[str] = []
    for page in doc:
        text_page_dict = page.get_text("dict", flags=extraction_flags)
        block_texts = []
        for block in text_page_dict.get("blocks", []):
            if block.get("type") != 0:
                continue
            line_texts = []
            for line in block.get("lines", []):
                span_texts = [
                    span["text"]
                    for span in line.get("spans", [])
                    if span.get("size") is not None
                    and span["font"] == font_name
                    and int(round(span["size"])) == font_size_rounded
                ]
                if span_texts:
                    line_texts.append(" ".join(span_texts))
            if line_texts:
                block_texts.append("\n".join(line_texts))
        if block_texts:
            pages_text.append("\n\n".join(block_texts))
    return pages_text


def extract_majority_font_text_directly(pdf_path: str) -> str:
    """
    Extracts text from PDF, identifies the majority font and size,
    and then directly assembles a plain text string containing only the text
    that matches this majority font, attempting to preserve basic structure.
    This method does NOT create an intermediate PDF document.

    Args:
        pdf_path: Path to the PDF file to process.

    Returns:
        The filtered plain text, or "" on any failure (no pages, no font
        information, no matching text, or an exception — errors are
        printed, never raised).
    """
    original_doc = None
    try:
        original_doc = fitz.open(pdf_path)
        if not original_doc.page_count:
            print("FontFilter (Direct): PDF has no pages.")
            return ""

        pdf_basename = os.path.basename(pdf_path)
        print(f"FontFilter (Direct): Analyzing fonts in '{pdf_basename}' ({original_doc.page_count} pages)...")

        # Pass 1: find the dominant (font, rounded size) by character count.
        font_char_counts = _count_chars_per_font(original_doc)
        if not font_char_counts:
            print("FontFilter (Direct): No text with font information found in PDF.")
            return ""

        majority_font_tuple_info = font_char_counts.most_common(1)[0]
        (majority_font_name, majority_font_size_rounded) = majority_font_tuple_info[0]
        char_count_for_majority = majority_font_tuple_info[1]
        print(
            f"FontFilter (Direct): Majority font identified: Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt ({char_count_for_majority} chars).")

        # Pass 2: re-walk the document keeping only majority-font spans.
        print(
            f"FontFilter (Direct): Extracting text matching majority font (Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt)...")
        all_pages_collected_text = _collect_majority_font_text(
            original_doc, majority_font_name, majority_font_size_rounded)

        if not all_pages_collected_text:
            print("FontFilter (Direct): No text matching the majority font was found to extract.")
            return ""

        final_text = "\n\n".join(all_pages_collected_text)
        print(f"FontFilter (Direct): Successfully extracted text. Total length: {len(final_text)} characters.")
        return final_text

    except Exception as e:
        print(f"Error in extract_majority_font_text_directly for '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        if original_doc:
            original_doc.close()
|
|
|
|
|
def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
    """
    Extracts raw plain text from the PDF at pdf_path without any filtering.
    Expects pdf_path to be a valid path to a PDF file.

    Returns the concatenated per-page text, or "" if anything goes wrong
    (the error is printed, not raised). The document handle is always
    closed.
    """
    doc = None
    try:
        doc = fitz.open(pdf_path)
        # Accumulate each page's raw text, then join in a single pass.
        page_texts = []
        for page in doc:
            page_texts.append(page.get_text("text"))
        return "".join(page_texts)
    except Exception as e:
        print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")
        return ""
    finally:
        if doc:
            doc.close()