# pdf_processing.py
import fitz # PyMuPDF
import pymupdf4llm
import os
import traceback
from typing import Any, Dict, List, Optional # Use standard List, Dict, Optional
from collections import Counter


def convert_rect_to_dict(rect: fitz.Rect) -> Optional[Dict[str, float]]:
"""Converts a fitz.Rect object to a dictionary."""
if not rect or not isinstance(rect, fitz.Rect):
# print(f"Warning: Invalid rect object received: {rect}") # Can be verbose
return None
return {
"x0": rect.x0, "y0": rect.y0, "x1": rect.x1, "y1": rect.y1,
"width": rect.width, "height": rect.height
}
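

# Example of the helper above: fitz.Rect(10, 20, 110, 40) becomes
#   {"x0": 10.0, "y0": 20.0, "x1": 110.0, "y1": 40.0,
#    "width": 100.0, "height": 20.0}
# while None (or any non-Rect value) yields None.
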
def _get_specific_error_rect_in_context(
page: fitz.Page,
context_rect: fitz.Rect,
error_text_verbatim: str
) -> Optional[fitz.Rect]:
"""
Tries to find the precise bounding box of error_text_verbatim within
the larger context_rect on the given page.
"""
if not error_text_verbatim or error_text_verbatim.isspace():
print(f"Debug: _get_specific_error_rect_in_context: error_text_verbatim is empty or whitespace.")
return None
# Extract words sorted by position within the given context_rect
# Each word_data is (x0, y0, x1, y1, "text", block_no, line_no, word_no)
words_on_page_in_clip = page.get_text("words", clip=context_rect, sort=True)
# print(f"Debug: _get_specific_error_rect_in_context: Searching for '{error_text_verbatim}' in {len(words_on_page_in_clip)} words within clip {context_rect}")
error_tokens = error_text_verbatim.strip().split()
if not error_tokens:
print(f"Debug: _get_specific_error_rect_in_context: No tokens from error_text_verbatim '{error_text_verbatim}'.")
return None
found_rects_for_error_sequence = []
for i in range(len(words_on_page_in_clip) - len(error_tokens) + 1):
match = True
current_sequence_rects = []
# print(f"Debug: _get_specific_error_rect_in_context: Trying match starting at PDF word '{words_on_page_in_clip[i][4]}'")
for j in range(len(error_tokens)):
pdf_word_text = words_on_page_in_clip[i+j][4]
error_token_to_match = error_tokens[j]
# Basic normalization for comparison
pdf_word_normalized = pdf_word_text.strip().lower()
error_token_normalized = error_token_to_match.strip().lower()
# A more robust comparison might involve removing common punctuation
# or handling hyphenation if LanguageTool splits differently than PyMuPDF.
if error_token_normalized != pdf_word_normalized:
# print(f"Debug: _get_specific_error_rect_in_context: Mismatch: '{error_token_normalized}' (expected) vs '{pdf_word_normalized}' (pdf word)")
match = False
break
current_sequence_rects.append(fitz.Rect(words_on_page_in_clip[i+j][:4]))
if match:
# print(f"Debug: _get_specific_error_rect_in_context: Found match for '{error_text_verbatim}'")
found_rects_for_error_sequence = current_sequence_rects
break # Found the first full match of the error_text_verbatim
    if found_rects_for_error_sequence:
        # Union of the individual word rects = bounding box of the full error text.
        # Seed with the first rect rather than an empty fitz.Rect(), so the union
        # cannot accidentally extend toward the page origin on PyMuPDF versions
        # where include_rect() does not special-case empty rects.
        final_error_bbox = fitz.Rect(found_rects_for_error_sequence[0])
        for r_part in found_rects_for_error_sequence[1:]:
            final_error_bbox.include_rect(r_part)
        if not final_error_bbox.is_empty:
            return final_error_bbox
    return None
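

# Illustration of the helper above on a hypothetical document: narrow a context
# match down to the offending words only. The file name and strings below are
# placeholders, not part of this module's API.
#
#   doc = fitz.open("sample.pdf")
#   page = doc[0]
#   hits = page.search_for("teh quick brown fox")   # rects of the context string
#   if hits:
#       rect = _get_specific_error_rect_in_context(page, hits[0], "teh")
#       print(rect)  # bbox of just "teh", or None if the tokens did not align
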
def try_map_issues_to_page_rects(
issues_to_map_for_context: List[Dict[str, Any]],
pdf_rects_from_search: List[fitz.Rect], # Rects for occurrences of the wider context string
page_number_for_mapping: int,
page: fitz.Page # The current PyMuPDF page object
) -> int:
mapped_count = 0
# We assume that the number of issues for a given context string on a page
# should not exceed the number of times that context string appears.
# If it does, we only map up to the number of found context occurrences.
limit = min(len(issues_to_map_for_context), len(pdf_rects_from_search))
for i in range(limit):
issue_to_update = issues_to_map_for_context[i]
if issue_to_update['is_mapped_to_pdf']:
continue
# This is the rectangle for the i-th occurrence of the wider context string
context_occurrence_rect = pdf_rects_from_search[i]
final_rect_for_issue = context_occurrence_rect # Default to the whole context rect
# For LanguageTool issues, try to refine the rect to the specific error text
if issue_to_update.get('source_check_type') == 'LanguageTool':
error_text_verbatim = issue_to_update.get('error_text_verbatim')
if error_text_verbatim:
# print(f"Debug: Refining LT issue: '{error_text_verbatim}' within context rect {context_occurrence_rect}")
specific_error_rect = _get_specific_error_rect_in_context(
page, context_occurrence_rect, error_text_verbatim
)
if specific_error_rect:
final_rect_for_issue = specific_error_rect
# print(f"Debug: Refined rect to: {final_rect_for_issue}")
else:
# print(f"Debug: Could not refine rect, using context rect: {context_occurrence_rect}")
pass # Stick with the wider context_occurrence_rect if specific not found
coord_dict = convert_rect_to_dict(final_rect_for_issue)
if coord_dict:
issue_to_update['pdf_coordinates_list'] = [coord_dict]
issue_to_update['is_mapped_to_pdf'] = True
issue_to_update['mapped_page_number'] = page_number_for_mapping
mapped_count += 1
return mapped_count
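

# A minimal driver sketch for the mapper above. It assumes each issue dict was
# initialized with 'is_mapped_to_pdf': False and carries its wider context
# string under a hypothetical 'context_text' key; adapt the key to the real
# issue schema. Note that page.search_for() returns one rect per hit segment,
# so a context string wrapping across lines yields several rects for a single
# occurrence; short, single-line context strings map most reliably.
def map_issues_to_pdf_sketch(pdf_path: str, issues: List[Dict[str, Any]]) -> int:
    """Sketch: locate each issue's context string and attach PDF coordinates."""
    total_mapped = 0
    doc = fitz.open(pdf_path)
    try:
        for page_index in range(doc.page_count):
            page = doc[page_index]
            # Group the still-unmapped issues by their context string.
            by_context: Dict[str, List[Dict[str, Any]]] = {}
            for issue in issues:
                if not issue.get('is_mapped_to_pdf'):
                    by_context.setdefault(issue.get('context_text', ''), []).append(issue)
            for context_str, issue_group in by_context.items():
                if not context_str:
                    continue
                rects = page.search_for(context_str)  # one rect per occurrence segment
                if rects:
                    total_mapped += try_map_issues_to_page_rects(
                        issue_group, rects, page_index + 1, page  # 1-based page numbers
                    )
    finally:
        doc.close()
    return total_mapped
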
def extract_majority_font_text_directly(pdf_path: str) -> str:
"""
Extracts text from PDF, identifies the majority font and size,
and then directly assembles a plain text string containing only the text
that matches this majority font, attempting to preserve basic structure.
This method does NOT create an intermediate PDF document.
"""
original_doc = None
try:
        # 1. Open the PDF and analyze fonts to find the majority (name, size) pair.
original_doc = fitz.open(pdf_path)
if not original_doc.page_count:
print("FontFilter (Direct): PDF has no pages.")
return ""
font_char_counts: Counter = Counter()
pdf_basename = os.path.basename(pdf_path)
print(f"FontFilter (Direct): Analyzing fonts in '{pdf_basename}' ({original_doc.page_count} pages)...")
# First pass: Analyze fonts to find the majority
for page_num_analysis in range(original_doc.page_count):
page_analysis = original_doc[page_num_analysis]
# Using TEXTFLAGS_TEXT for potentially cleaner text from spans
text_dict_analysis = page_analysis.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)
for block_analysis in text_dict_analysis.get("blocks", []):
if block_analysis.get("type") == 0: # type 0 is a text block
for line_analysis in block_analysis.get("lines", []):
for span_analysis in line_analysis.get("spans", []):
font_name = span_analysis["font"]
font_size = span_analysis.get("size")
if font_size is None: continue # Skip if size is not available
font_size_rounded = int(round(font_size))
text = span_analysis["text"]
if not text.strip(): continue # Skip purely whitespace spans
font_char_counts[(font_name, font_size_rounded)] += len(text)
if not font_char_counts:
print("FontFilter (Direct): No text with font information found in PDF.")
return ""
majority_font_tuple_info = font_char_counts.most_common(1)[0]
(majority_font_name, majority_font_size_rounded) = majority_font_tuple_info[0]
char_count_for_majority = majority_font_tuple_info[1]
print(
f"FontFilter (Direct): Majority font identified: Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt ({char_count_for_majority} chars).")
# 2. Second Pass: Extract and Assemble Text Based on Majority Font
print(
f"FontFilter (Direct): Extracting text matching majority font (Name='{majority_font_name}', RoundedSize={majority_font_size_rounded}pt)...")
all_pages_collected_text = [] # List to hold text from each page (as a list of block texts)
for page_num_extraction in range(original_doc.page_count):
page = original_doc[page_num_extraction]
# Using flags for potentially better whitespace and ligature handling in extracted text
text_page_dict = page.get_text("dict",
flags=fitz.TEXTFLAGS_TEXT | fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)
page_blocks_text_parts = [] # Collect text from blocks on this page
for block in text_page_dict.get("blocks", []):
if block.get("type") == 0: # Text block
current_block_lines_text_parts = []
for line in block.get("lines", []):
current_line_spans_text_parts = []
for span in line.get("spans", []):
# Check if this span matches the majority font
current_span_font_name = span["font"]
current_span_font_size = span.get("size")
if current_span_font_size is not None and \
current_span_font_name == majority_font_name and \
int(round(current_span_font_size)) == majority_font_size_rounded:
current_line_spans_text_parts.append(span["text"])
if current_line_spans_text_parts:
# Join text from selected spans within a line with a single space
line_text = " ".join(current_line_spans_text_parts)
current_block_lines_text_parts.append(line_text)
if current_block_lines_text_parts:
# Join lines within a block with a single newline
block_text = "\n".join(current_block_lines_text_parts)
page_blocks_text_parts.append(block_text)
if page_blocks_text_parts:
# Join blocks on a page with a double newline (simulating paragraph breaks)
all_pages_collected_text.append("\n\n".join(page_blocks_text_parts))
if not all_pages_collected_text:
print("FontFilter (Direct): No text matching the majority font was found to extract.")
return ""
        # Join text from all pages. The "\n\n" separator makes a page boundary look
        # like an ordinary block (paragraph) break; insert a custom separator here
        # if pages must remain distinguishable.
        final_text = "\n\n".join(all_pages_collected_text)
print(f"FontFilter (Direct): Successfully extracted text. Total length: {len(final_text)} characters.")
return final_text
except Exception as e:
print(f"Error in extract_majority_font_text_directly for '{pdf_path}': {e}\n{traceback.format_exc()}")
return ""
finally:
if original_doc: original_doc.close()
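

# Typical use (hypothetical path): isolate the body text of an article so that
# downstream checks skip headers, footers, captions, and references set in
# other fonts or sizes:
#
#   body_text = extract_majority_font_text_directly("paper.pdf")
#   if body_text:
#       run_language_checks(body_text)  # placeholder for the actual consumer
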
def extract_plain_text_from_original_pdf(pdf_path: str) -> str:
"""
Extracts raw plain text from the PDF at pdf_path without any filtering.
Expects pdf_path to be a valid path to a PDF file.
"""
doc_orig_text = None
try:
doc_orig_text = fitz.open(pdf_path)
full_text_parts = [page.get_text("text") for page in doc_orig_text]
        # print(full_text_parts)  # Debug output; very noisy for large PDFs
return "".join(full_text_parts)
except Exception as e:
print(f"Error extracting plain text from original PDF '{pdf_path}': {e}\n{traceback.format_exc()}")
return ""
finally:
if doc_orig_text: doc_orig_text.close()
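

if __name__ == "__main__":
    # Minimal smoke test, assuming a PDF path is supplied on the command line
    # (falls back to a hypothetical sample.pdf in the working directory).
    import sys

    sample_path = sys.argv[1] if len(sys.argv) > 1 else "sample.pdf"
    if os.path.exists(sample_path):
        plain = extract_plain_text_from_original_pdf(sample_path)
        print(f"Plain text: {len(plain)} characters")
        majority = extract_majority_font_text_directly(sample_path)
        print(f"Majority-font text: {len(majority)} characters")
    else:
        print(f"No PDF found at '{sample_path}'; pass a path as the first argument.")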