Spaces:

SFEREWQW
/

114514

Runtime error

App Files Files Community

114514 / pdf_extract_kit /utils /merge_blocks_and_spans.py

SFEREWQW

Upload 395 files

18e4106 verified 20 days ago

raw

history blame contribute delete

9.59 kB

	# revised from https://github.com/opendatalab/MinerU/blob/7f0fe20004af7416db886f4b75c116bcc1c986b4/magic_pdf/pdf_parse_union_core.py#L177
	# from fast_langdetect import detect_language
	# import unicodedata
	import re


	def __is_overlaps_y_exceeds_threshold(bbox1, bbox2, overlap_ratio_threshold=0.8):
	    """Check whether two bboxes overlap enough on the y axis.

	    The vertical overlap is divided by the height of the shorter of the two
	    boxes; the result must be strictly greater than
	    ``overlap_ratio_threshold`` (80% by default).

	    :param bbox1: sequence of four values; indices 1 and 3 are read as y0, y1
	    :param bbox2: sequence of four values; indices 1 and 3 are read as y0, y1
	    :param overlap_ratio_threshold: minimum overlap ratio (exclusive)
	    :return: True when the overlap ratio exceeds the threshold, else False
	    """
	    _, y0_1, _, y1_1 = bbox1
	    _, y0_2, _, y1_2 = bbox2

	    overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
	    min_height = min(y1_1 - y0_1, y1_2 - y0_2)

	    # A zero-height (or inverted) box has no measurable vertical extent, so
	    # no ratio can be formed; report "no overlap" instead of dividing by 0.
	    if min_height <= 0:
	        return False

	    return (overlap / min_height) > overlap_ratio_threshold

	def merge_spans_to_line(spans):
	    """Group spans into lines according to their vertical position.

	    ``spans`` is sorted in place by the y0 coordinate (``bbox[1]``) and then
	    walked in order. A span joins the current line when it overlaps the
	    line's last span on the y axis; an "isolated" span always stands alone
	    on its own line.

	    :param spans: list of span dicts carrying 'bbox' and 'type' keys
	    :return: list of lines, each line being a list of spans
	    """
	    if len(spans) == 0:
	        return []

	    # Sort by the y0 coordinate (mutates the caller's list).
	    spans.sort(key=lambda item: item['bbox'][1])

	    grouped = []
	    row = [spans[0]]
	    for candidate in spans[1:]:
	        # An "isolated" span, or a row that already holds one, forces a break.
	        forces_break = candidate['type'] == 'isolated' or any(
	            member['type'] == 'isolated' for member in row
	        )
	        # Otherwise the span stays on the row only if it overlaps the row's
	        # last span vertically.
	        if not forces_break and __is_overlaps_y_exceeds_threshold(
	                candidate['bbox'], row[-1]['bbox']):
	            row.append(candidate)
	        else:
	            grouped.append(row)
	            row = [candidate]

	    # Flush the final row (it always holds at least one span).
	    grouped.append(row)
	    return grouped

	# Order the spans of every line from left to right.
	def line_sort_spans_by_left_to_right(lines):
	    """Sort each line's spans by x0 and wrap the line with its bounding box.

	    Each inner list is sorted in place. The line bbox is the smallest box
	    enclosing all span bboxes of that line.

	    :param lines: list of lines, each a list of span dicts with a 'bbox' key
	    :return: list of ``{"bbox": [x0, y0, x1, y1], "spans": line}`` dicts
	    """
	    wrapped = []
	    for row in lines:
	        # Sort by the x0 coordinate.
	        row.sort(key=lambda item: item['bbox'][0])
	        boxes = [item['bbox'] for item in row]
	        left = min(box[0] for box in boxes)
	        top = min(box[1] for box in boxes)
	        right = max(box[2] for box in boxes)
	        bottom = max(box[3] for box in boxes)
	        wrapped.append({"bbox": [left, top, right, bottom], "spans": row})
	    return wrapped

	def fix_text_block(block):
	    """Turn a text block's flat span list into sorted lines.

	    Every span typed "isolated" is retyped to "inline" first, since formulas
	    inside a text block are treated as inline. The spans are then grouped
	    into lines, each line is ordered left to right, the result is stored
	    under 'lines' and the 'spans' key is removed.

	    :param block: block dict with a 'spans' list; modified in place
	    :return: the same block dict
	    """
	    members = block['spans']
	    for member in members:
	        if member['type'] == "isolated":
	            member['type'] = "inline"
	    block['lines'] = line_sort_spans_by_left_to_right(
	        merge_spans_to_line(members)
	    )
	    del block['spans']
	    return block


	def fix_interline_block(block):
	    """Turn an interline block's flat span list into sorted lines.

	    The spans are grouped into lines, each line is ordered left to right,
	    the result is stored under 'lines' and the 'spans' key is removed.
	    Span types are left untouched.

	    :param block: block dict with a 'spans' list; modified in place
	    :return: the same block dict
	    """
	    grouped = merge_spans_to_line(block['spans'])
	    block['lines'] = line_sort_spans_by_left_to_right(grouped)
	    block.pop('spans')
	    return block

	def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2):
	    """Return the share of bbox1's area that is covered by bbox2.

	    Both boxes are indexed as ``[x0, y0, x1, y1]``.

	    :param bbox1: box whose area is the denominator
	    :param bbox2: box intersected with bbox1
	    :return: float ratio ``intersection_area / bbox1_area``; 0.0 when the
	        boxes do not intersect or when bbox1 has zero area
	    """
	    # Corners of the intersection rectangle.
	    x_left = max(bbox1[0], bbox2[0])
	    y_top = max(bbox1[1], bbox2[1])
	    x_right = min(bbox1[2], bbox2[2])
	    y_bottom = min(bbox1[3], bbox2[3])

	    if x_right < x_left or y_bottom < y_top:
	        return 0.0

	    intersection_area = (x_right - x_left) * (y_bottom - y_top)
	    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
	    # A degenerate bbox1 cannot form a ratio; return a float here too so the
	    # return type is the same on every path.
	    if bbox1_area == 0:
	        return 0.0
	    return intersection_area / bbox1_area

	def fill_spans_in_blocks(blocks, spans, radio):
	    """Assign spans to the blocks that contain them.

	    For each block a bbox is built from entries 0, 1, 2 and 5 of
	    ``block['poly']`` (normalised so that left <= right and top <= bottom).
	    A span belongs to the block when the share of the span's own area lying
	    inside the block bbox is greater than ``radio``. Assigned spans are
	    removed from ``spans`` in place, so a span lands in at most one block.

	    :param blocks: iterable of dicts with 'category_type' and 'poly' keys
	    :param spans: list of span dicts with a 'bbox' key; mutated in place
	    :param radio: overlap-ratio threshold (exclusive)
	    :return: tuple ``(block_with_spans, spans)`` where ``spans`` is the same
	        list object holding the spans left unassigned
	    """
	    filled = []
	    for block in blocks:
	        kind = block["category_type"]
	        x_a = block['poly'][0]
	        y_a = block['poly'][1]
	        x_b = block['poly'][2]
	        y_b = block['poly'][5]
	        left, right = min(x_a, x_b), max(x_a, x_b)
	        top, bottom = min(y_a, y_b), max(y_a, y_b)
	        region = [left, top, right, bottom]

	        captured = [
	            item for item in spans
	            if calculate_overlap_area_in_bbox1_area_ratio(item["bbox"], region) > radio
	        ]

	        # Disabled post-processing steps kept from the original for reference:
	        #   - inline formula adjustment: match the height of text on the same
	        #     line (left neighbour preferred, then right)
	        #       modify_y_axis(block_spans, displayed_list, text_inline_lines)
	        #   - retype interline formulas misdetected by the model to inline
	        #       block_spans = modify_inline(block_spans, displayed_list, text_inline_lines)
	        #   - de-overlap bboxes; this alters span bboxes and breaks later fills
	        #       block_spans = remove_overlap_between_bbox_for_span(block_spans)

	        filled.append({
	            'type': kind,
	            'bbox': region,
	            'saved_info': block,
	            'spans': captured,
	        })

	        # Drop the spans just assigned so later blocks cannot claim them.
	        for taken in captured:
	            spans.remove(taken)

	    return filled, spans

	def fix_block_spans(block_with_spans):
	    """Convert every block's 'spans' list into a 'lines' structure.

	    Blocks typed "isolate_formula" go through ``fix_interline_block``; every
	    other block goes through ``fix_text_block``. Both replace the block's
	    'spans' key with 'lines'.

	    Dedicated image/table handling (nesting caption and footnote blocks)
	    exists only as disabled code in the original:
	        if block_type == BlockType.Image:
	            block = fix_image_block(block, img_blocks)
	        elif block_type == BlockType.Table:
	            block = fix_table_block(block, table_blocks)

	    :param block_with_spans: iterable of block dicts with 'type' and 'spans'
	    :return: list of the processed blocks, in input order
	    """
	    return [
	        fix_interline_block(entry)
	        if entry['type'] == "isolate_formula"
	        else fix_text_block(entry)
	        for entry in block_with_spans
	    ]


	# def detect_lang(text: str) -> str:

	# if len(text) == 0:
	# return ""
	# try:
	# lang_upper = detect_language(text)
	# except:
	# html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
	# lang_upper = detect_language(html_no_ctrl_chars)
	# try:
	# lang = lang_upper.lower()
	# except:
	# lang = ""
	# return lang

	def detect_lang(string):
	    """Classify a string as Chinese or English.

	    :param string: text to inspect
	    :return: 'zh' if any character falls in the range U+4E00..U+9FFF,
	        otherwise 'en' (including for an empty string)
	    """
	    has_cjk = any('\u4e00' <= ch <= '\u9fff' for ch in string)
	    return 'zh' if has_cjk else 'en'

	def ocr_escape_special_markdown_char(content):
	    """Backslash-escape characters that carry meaning in Markdown.

	    The characters ``*``, `````, ``~`` and ``$`` are each prefixed with a
	    backslash; everything else is returned unchanged.

	    :param content: text to escape
	    :return: the escaped text
	    """
	    # The inserted backslash is not itself an escaped character, so one
	    # translation pass gives the same result as successive replacements.
	    escapes = str.maketrans({symbol: "\\" + symbol for symbol in "*`~$"})
	    return content.translate(escapes)

	# def split_long_words(text):
	# segments = text.split(' ')
	# for i in range(len(segments)):
	# words = re.findall(r'\w+\|[^\w]', segments[i], re.UNICODE)
	# for j in range(len(words)):
	# if len(words[j]) > 15:
	# words[j] = ' '.join(wordninja.split(words[j]))
	# segments[i] = ''.join(words)
	# return ' '.join(segments)


	def merge_para_with_text(para_block):
	    """Render a paragraph block's lines and spans into one Markdown string.

	    Per line, the stripped contents of all "text" spans are concatenated and
	    passed to ``detect_lang``; the language is decided on the whole line
	    because some documents carry one character per span, which is too little
	    to classify on its own. Each span is then rendered by type:

	    - "text": content with Markdown special characters escaped
	    - "inline" / "ignore-formula": wrapped as ``$...$``
	    - "isolated": wrapped as a ``$$`` display block
	    - "footnote": wrapped as ``$^...$`` (no extra ``^`` if one is present)
	    - any other type: skipped

	    Rendered pieces are stripped; on a non-'zh' line each piece is followed
	    by a single space, on a 'zh' line pieces are joined without separator.

	    :param para_block: dict with a 'lines' list; each line has a 'spans' list
	        of dicts with 'type' and 'content'
	    :return: the assembled paragraph text
	    """
	    pieces = []
	    for row in para_block['lines']:
	        members = row['spans']
	        plain = "".join(
	            member['content'].strip()
	            for member in members
	            if member['type'] == "text"
	        )
	        row_lang = detect_lang(plain) if plain != "" else ""
	        # Chinese text needs no separator between pieces; other text does.
	        separator = '' if 'zh' in row_lang else ' '

	        for member in members:
	            kind = member['type']
	            if kind == "text":
	                rendered = ocr_escape_special_markdown_char(member['content'])
	            elif kind == 'inline' or kind == 'ignore-formula':
	                rendered = f" ${member['content'].strip('$')}$ "
	            elif kind == 'isolated':
	                rendered = f"\n$$\n{member['content'].strip('$')}\n$$\n"
	            elif kind == 'footnote':
	                mark = member['content'].strip('$')
	                rendered = f" ${mark}$ " if '^' in mark else f" $^{mark}$ "
	            else:
	                continue

	            # Only a truly empty rendering is dropped; whitespace-only
	            # content still contributes its separator.
	            if rendered == '':
	                continue
	            pieces.append(rendered.strip() + separator)
	    return "".join(pieces)