texmetrics-regex-checks-gradio-1-devtesting

Running

App Files Files Community

texmetrics-regex-checks-gradio-1-devtesting / pdf_processing.py

samyak152002

Create pdf_processing.py

961b876 verified 26 days ago

raw

history blame

3.3 kB

	# pdf_processing.py
	import fitz # PyMuPDF
	import pymupdf4llm
	import os
	import tempfile
	import traceback
	from typing import Tuple, Optional, List, Dict, Any

	def convert_rect_to_dict(rect: fitz.Rect) -> Optional[Dict[str, float]]:
	    """Return the geometry of a ``fitz.Rect`` as a plain dictionary.

	    The result has the keys ``x0``, ``y0``, ``x1``, ``y1``, ``width`` and
	    ``height``, each read from the attribute of the same name on ``rect``.
	    A falsy value, or anything that is not a ``fitz.Rect``, produces a
	    printed warning and ``None`` instead of a dictionary.
	    """
	    # Truthiness is tested before the type so falsy inputs (e.g. None)
	    # short-circuit without reaching isinstance.
	    if not (rect and isinstance(rect, fitz.Rect)):
	        print(f"Warning: Invalid rect object received: {rect}")
	        return None

	    field_names = ("x0", "y0", "x1", "y1", "width", "height")
	    return {field: getattr(rect, field) for field in field_names}

	def try_map_issues_to_page_rects(
	    issues_to_map_for_context: List[Dict[str, Any]],
	    pdf_rects: List[fitz.Rect],
	    page_number_for_mapping: int  # 1-based page number
	) -> int:
	    """Attach PDF rectangles to issues by position and count the new mappings.

	    Issue ``i`` is paired with rectangle ``i``; pairing stops at the end of
	    the shorter list. Each issue dict that is not yet flagged as mapped and
	    whose rectangle converts successfully is updated in place with:

	    * ``pdf_coordinates_list`` - a one-element list holding the dict
	      returned by ``convert_rect_to_dict``,
	    * ``is_mapped_to_pdf`` - set to ``True``,
	    * ``mapped_page_number`` - set to ``page_number_for_mapping``.

	    Args:
	        issues_to_map_for_context: Issue dicts to update in place.
	        pdf_rects: Candidate rectangles, paired with the issues by index.
	        page_number_for_mapping: Page number recorded on each mapped issue.

	    Returns:
	        The number of issues newly mapped by this call.
	    """
	    mapped_count = 0

	    # zip() stops at the shorter sequence, which is the same bound as
	    # min(len(issues), len(rects)) in an index loop.
	    for issue_to_update, pdf_rect in zip(issues_to_map_for_context, pdf_rects):
	        # .get(): an issue that was never flagged counts as "not mapped yet"
	        # instead of raising KeyError.
	        if issue_to_update.get('is_mapped_to_pdf'):
	            # Pairing is positional, so the rectangle at this index is
	            # skipped together with the already-mapped issue.
	            continue

	        coord_dict = convert_rect_to_dict(pdf_rect)
	        if coord_dict:
	            issue_to_update['pdf_coordinates_list'] = [coord_dict]  # Store as list of dicts
	            issue_to_update['is_mapped_to_pdf'] = True
	            issue_to_update['mapped_page_number'] = page_number_for_mapping
	            mapped_count += 1
	        else:
	            # The warning is best-effort: a missing or None context must not
	            # turn a log line into a crash.
	            context_preview = str(issue_to_update.get('context_text') or '')[:30]
	            print(f" Warning: Could not convert rect for context '{context_preview}...' on page {page_number_for_mapping}")

	    return mapped_count

	def extract_pdf_text(file_input: Any) -> str:
	    """Extract the full text of a PDF as Markdown using PyMuPDF4LLM.

	    Args:
	        file_input: Either a filesystem path (``str`` or ``os.PathLike``) or
	            a file-like object exposing ``read()``. File-like input is
	            copied into a temporary ``.pdf`` file, because the extraction
	            below is given a path; that file is removed before returning.

	    Returns:
	        The Markdown text returned by ``pymupdf4llm.to_markdown``. If the
	        input type is unsupported or any step fails, the error and its
	        traceback are printed and an empty string is returned; no exception
	        is propagated to the caller.
	    """
	    temp_file_path_for_pymupdf4llm = None
	    try:
	        if isinstance(file_input, (str, os.PathLike)):
	            actual_path_to_process = os.fspath(file_input)
	        elif hasattr(file_input, 'read') and callable(file_input.read):
	            # The with-block closes the handle even when seek/read/write
	            # raises; delete=False keeps the file on disk for the
	            # path-based calls below.
	            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
	                temp_file_path_for_pymupdf4llm = temp_file_obj.name
	                try:
	                    file_input.seek(0)
	                except (AttributeError, OSError, ValueError):
	                    # Stream cannot be rewound: read from the current
	                    # position instead of giving up.
	                    pass
	                temp_file_obj.write(file_input.read())
	            actual_path_to_process = temp_file_path_for_pymupdf4llm
	        else:
	            raise ValueError("Input 'file_input' must be a file path (str) or a file-like object.")

	        # Opened only to report the page count; the context manager closes
	        # the document even if len() fails.
	        with fitz.open(actual_path_to_process) as doc_for_page_count:
	            page_count = len(doc_for_page_count)
	        print(f"PDF has {page_count} pages. Extracting Markdown using pymupdf4llm.")

	        markdown_text = pymupdf4llm.to_markdown(actual_path_to_process)

	        print(f"Total extracted Markdown text length: {len(markdown_text)} characters.")
	        return markdown_text

	    except Exception as e:
	        print(f"Error extracting text from PDF: {str(e)}")
	        traceback.print_exc()
	        return ""
	    finally:
	        if temp_file_path_for_pymupdf4llm:
	            try:
	                os.remove(temp_file_path_for_pymupdf4llm)
	            except FileNotFoundError:
	                # Already gone; nothing to clean up.
	                pass
	            except OSError as cleanup_error:
	                # An exception escaping from finally would replace the
	                # value being returned, so report the failure instead.
	                print(f"Warning: Could not remove temporary file {temp_file_path_for_pymupdf4llm}: {cleanup_error}")