114514 / pdf_extract_kit /utils /merge_blocks_and_spans.py
SFEREWQW's picture
Upload 395 files
18e4106 verified
# revised from https://github.com/opendatalab/MinerU/blob/7f0fe20004af7416db886f4b75c116bcc1c986b4/magic_pdf/pdf_parse_union_core.py#L177
# from fast_langdetect import detect_language
# import unicodedata
import re
def __is_overlaps_y_exceeds_threshold(bbox1, bbox2, overlap_ratio_threshold=0.8):
    """Check whether two boxes overlap enough along the y axis.

    The height of the vertical overlap is divided by the height of the
    shorter of the two boxes; the boxes are considered overlapping when that
    ratio is strictly greater than ``overlap_ratio_threshold``.

    Args:
        bbox1: Four-element sequence ``(x0, y0, x1, y1)``; only ``y0`` and
            ``y1`` are used.
        bbox2: Four-element sequence with the same layout as ``bbox1``.
        overlap_ratio_threshold: Ratio that must be exceeded (default 0.8).

    Returns:
        bool: True when the overlap ratio exceeds the threshold. False when
        the shorter box has no positive height, since the ratio is undefined
        in that case.
    """
    _, y0_1, _, y1_1 = bbox1
    _, y0_2, _, y1_2 = bbox2
    overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
    min_height = min(y1_1 - y0_1, y1_2 - y0_2)
    # A degenerate (zero-height) box would otherwise divide by zero.
    if min_height <= 0:
        return False
    return (overlap / min_height) > overlap_ratio_threshold
def merge_spans_to_line(spans):
    """Group spans into lines according to their vertical overlap.

    ``spans`` is sorted in place by the top coordinate (``bbox[1]``) and then
    walked once. A span joins the line being built only if neither it nor any
    span already on that line has type "isolated", and it overlaps the last
    span of that line vertically; otherwise it starts a new line.

    Args:
        spans: List of span dicts, each with a ``'bbox'`` and a ``'type'``.

    Returns:
        A list of lines, where each line is a list of spans. An empty input
        yields an empty list.
    """
    if len(spans) == 0:
        return []

    # Order spans by their top edge (y0).
    spans.sort(key=lambda item: item['bbox'][1])

    remaining = iter(spans)
    grouped = []
    row = [next(remaining)]
    for candidate in remaining:
        # "isolated" spans never share a line with anything else; all other
        # spans share a line only when they overlap the row's last span.
        joins_row = (
            candidate['type'] not in ['isolated']
            and not any(member['type'] in ['isolated'] for member in row)
            and __is_overlaps_y_exceeds_threshold(candidate['bbox'], row[-1]['bbox'])
        )
        if joins_row:
            row.append(candidate)
        else:
            grouped.append(row)
            row = [candidate]

    # The row under construction always holds at least one span.
    grouped.append(row)
    return grouped
# 将每一个line中的span从左到右排序
def line_sort_spans_by_left_to_right(lines):
    """Order the spans of every line from left to right and wrap each line.

    Each line (a list of spans) is sorted in place by the left coordinate
    (``bbox[0]``). The returned line object carries the bounding box that
    encloses all of its spans together with the (same, now sorted) span list.

    Args:
        lines: List of lines, each a non-empty list of span dicts with a
            ``'bbox'`` entry.

    Returns:
        List of dicts with keys ``"bbox"`` (``[x0, y0, x1, y1]``) and
        ``"spans"``.
    """
    result = []
    for row in lines:
        # Sort by x0 so spans read left to right.
        row.sort(key=lambda item: item['bbox'][0])
        boxes = [item['bbox'] for item in row]
        enclosing = [
            min(box[0] for box in boxes),  # x0
            min(box[1] for box in boxes),  # y0
            max(box[2] for box in boxes),  # x1
            max(box[3] for box in boxes),  # y1
        ]
        result.append({"bbox": enclosing, "spans": row})
    return result
def fix_text_block(block):
    """Convert a text block's flat span list into sorted lines.

    Every span of type "isolated" inside the block is retyped to "inline",
    since formula spans inside a text block are treated as inline. The spans
    are then grouped into lines, each line is ordered left to right, the
    result is stored under ``block['lines']`` and the ``'spans'`` key is
    removed.

    Args:
        block: Block dict with a ``'spans'`` list; modified in place.

    Returns:
        The same ``block`` dict.
    """
    block_spans = block['spans']
    for item in block_spans:
        if item['type'] == "isolated":
            item['type'] = "inline"
    block['lines'] = line_sort_spans_by_left_to_right(
        merge_spans_to_line(block_spans)
    )
    del block['spans']
    return block
def fix_interline_block(block):
    """Convert an interline (display formula) block's spans into sorted lines.

    The spans are grouped into lines, each line is ordered left to right, the
    result is stored under ``block['lines']`` and the ``'spans'`` key is
    removed. Span types are left untouched.

    Args:
        block: Block dict with a ``'spans'`` list; modified in place.

    Returns:
        The same ``block`` dict.
    """
    grouped = merge_spans_to_line(block['spans'])
    block['lines'] = line_sort_spans_by_left_to_right(grouped)
    del block['spans']
    return block
def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2):
    """Return the fraction of ``bbox1``'s area covered by ``bbox2``.

    Args:
        bbox1: Box ``(x0, y0, x1, y1)`` whose area is the denominator.
        bbox2: Box ``(x0, y0, x1, y1)`` intersected with ``bbox1``.

    Returns:
        float: Intersection area divided by the area of ``bbox1``. ``0.0``
        when the boxes do not intersect or when ``bbox1`` has zero area.
    """
    # Corners of the intersection rectangle.
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    if bbox1_area == 0:
        # Keep the return type a float on every path.
        return 0.0
    return intersection_area / bbox1_area
def fill_spans_in_blocks(blocks, spans, radio):
    """Assign spans to blocks by positional overlap.

    For every block, a bounding box is built from ``poly[0]``, ``poly[1]``,
    ``poly[2]`` and ``poly[5]`` (normalised so that left <= right and
    top <= bottom). Each span still unassigned whose overlap with that box,
    relative to the span's own area, is strictly greater than ``radio`` is
    attached to the block. Assigned spans are removed from ``spans`` so a
    span ends up in at most one block (the first matching one).

    Args:
        blocks: Iterable of block dicts with ``"category_type"`` and
            ``'poly'`` entries.
        spans: List of span dicts with a ``"bbox"`` entry. Modified in
            place: spans assigned to a block are removed from it.
        radio: Overlap ratio threshold that must be exceeded.

    Returns:
        tuple: ``(block_with_spans, spans)`` where ``block_with_spans`` is a
        list of dicts with keys ``'type'``, ``'bbox'``, ``'saved_info'`` (the
        original block) and ``'spans'``, and ``spans`` is the same list
        object that was passed in, now holding only the unassigned spans.
    """
    block_with_spans = []
    for block in blocks:
        block_type = block["category_type"]
        poly = block['poly']
        x_a, y_a, x_b, y_b = poly[0], poly[1], poly[2], poly[5]
        block_bbox = [min(x_a, x_b), min(y_a, y_b), max(x_a, x_b), max(y_a, y_b)]

        # Split the unassigned spans in a single pass rather than calling
        # list.remove() once per matched span.
        matched = []
        unmatched = []
        for span in spans:
            if calculate_overlap_area_in_bbox1_area_ratio(span["bbox"], block_bbox) > radio:
                matched.append(span)
            else:
                unmatched.append(span)

        # NOTE: inline-formula height adjustment, conversion of misdetected
        # interline formulas to inline, and bbox de-overlapping are
        # intentionally not applied here; de-overlapping changes span bboxes
        # and breaks later filling.

        block_with_spans.append({
            'type': block_type,
            'bbox': block_bbox,
            'saved_info': block,
            'spans': matched,
        })

        # Update the caller's list in place so its identity is preserved.
        if matched:
            spans[:] = unmatched
    return block_with_spans, spans
def fix_block_spans(block_with_spans):
    """Replace each block's ``'spans'`` with sorted ``'lines'``.

    Blocks whose ``'type'`` is "isolate_formula" are processed with
    ``fix_interline_block``; every other block is processed with
    ``fix_text_block``. Both remove the ``'spans'`` field from the block.
    No image- or table-specific handling is performed here.

    Args:
        block_with_spans: Iterable of block dicts carrying ``'type'`` and
            ``'spans'``.

    Returns:
        List of the processed blocks, in input order.
    """
    return [
        fix_interline_block(entry)
        if entry['type'] == "isolate_formula"
        else fix_text_block(entry)
        for entry in block_with_spans
    ]
# def detect_lang(text: str) -> str:
# if len(text) == 0:
# return ""
# try:
# lang_upper = detect_language(text)
# except:
# html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
# lang_upper = detect_language(html_no_ctrl_chars)
# try:
# lang = lang_upper.lower()
# except:
# lang = ""
# return lang
def detect_lang(string):
    """Classify a string as Chinese or English by scanning its characters.

    :param string: text to inspect
    :return: ``'zh'`` if any character lies in the range U+4E00..U+9FFF,
        otherwise ``'en'`` (including for an empty string)
    """
    has_cjk = any(u'\u4e00' <= ch <= u'\u9fff' for ch in string)
    return 'zh' if has_cjk else 'en'
def ocr_escape_special_markdown_char(content):
    """Backslash-escape characters that carry Markdown meaning in body text.

    The characters ``*``, `````, ``~`` and ``$`` are each prefixed with a
    backslash; everything else is returned unchanged.
    """
    escape_table = str.maketrans({symbol: "\\" + symbol for symbol in "*`~$"})
    return content.translate(escape_table)
# def split_long_words(text):
# segments = text.split(' ')
# for i in range(len(segments)):
# words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
# for j in range(len(words)):
# if len(words[j]) > 15:
# words[j] = ' '.join(wordninja.split(words[j]))
# segments[i] = ''.join(words)
# return ' '.join(segments)
def merge_para_with_text(para_block):
    """Render a paragraph block's lines into a single Markdown string.

    For every line, the stripped contents of its "text" spans are first
    concatenated to decide the line's language (judging the whole line is
    more reliable than judging one-character spans). Each span is then
    rendered by type:

    * "text": content with Markdown special characters escaped
    * "inline" / "ignore-formula": content wrapped as inline math
    * "isolated": content wrapped as display math
    * "footnote": content wrapped as inline math, prefixed with ``^`` unless
      it already contains one
    * any other type: skipped

    Rendered pieces are stripped and concatenated; a trailing space follows
    each piece unless the line was detected as Chinese.

    Args:
        para_block: Dict with a ``'lines'`` list; each line has a ``'spans'``
            list of dicts with ``'type'`` and ``'content'``.

    Returns:
        The assembled paragraph text.
    """
    pieces = []
    for line in para_block['lines']:
        line_spans = line['spans']

        # Pass 1: gather plain text to detect the language of the line.
        plain_text = "".join(
            item['content'].strip() for item in line_spans if item['type'] == "text"
        )
        line_lang = detect_lang(plain_text) if plain_text != "" else ""
        # Chinese text needs no separator between pieces; other text does.
        separator = '' if 'zh' in line_lang else ' '

        # Pass 2: render each span according to its type.
        for item in line_spans:
            kind = item['type']
            if kind == "text":
                rendered = ocr_escape_special_markdown_char(item['content'])
            elif kind in ('inline', 'ignore-formula'):
                rendered = f" ${item['content'].strip('$')}$ "
            elif kind == 'isolated':
                rendered = f"\n$$\n{item['content'].strip('$')}\n$$\n"
            elif kind == 'footnote':
                footnote_body = item['content'].strip('$')
                if '^' in footnote_body:
                    rendered = f" ${footnote_body}$ "
                else:
                    rendered = f" $^{footnote_body}$ "
            else:
                continue
            if rendered != '':
                pieces.append(rendered.strip() + separator)
    return ''.join(pieces)