|
|
|
|
|
|
|
import re |
|
|
|
|
|
def __is_overlaps_y_exceeds_threshold(bbox1, bbox2, overlap_ratio_threshold=0.8):
    """Check whether two boxes overlap enough on the y axis.

    The height of the vertical overlap is divided by the height of the
    shorter of the two boxes and compared with ``overlap_ratio_threshold``.

    Args:
        bbox1: Box given as ``(x0, y0, x1, y1)``; only ``y0``/``y1`` are used.
        bbox2: Second box in the same layout.
        overlap_ratio_threshold: Ratio that must be strictly exceeded
            (defaults to 0.8, i.e. 80% of the shorter box).

    Returns:
        bool: ``True`` when the overlap ratio is strictly greater than the
        threshold. ``False`` when the shorter box has no positive height.
    """
    _, y0_1, _, y1_1 = bbox1
    _, y0_2, _, y1_2 = bbox2

    overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
    min_height = min(y1_1 - y0_1, y1_2 - y0_2)

    # A degenerate box (zero or negative height) cannot share a line with
    # anything; bail out instead of dividing by zero.
    if min_height <= 0:
        return False

    return (overlap / min_height) > overlap_ratio_threshold
|
|
|
def merge_spans_to_line(spans):
    """Group spans into lines based on their vertical position.

    ``spans`` is sorted in place by the top edge (``bbox[1]``). Walking the
    sorted spans, a span joins the line being built when it overlaps the
    line's last span enough on the y axis (as decided by
    ``__is_overlaps_y_exceeds_threshold``). A span of type ``'isolated'``
    always gets a line of its own: it closes the line being built, and the
    span that follows it starts a new line as well.

    Args:
        spans: List of span dicts carrying at least ``'bbox'`` and ``'type'``.

    Returns:
        list: A list of lines, each line being a list of span dicts.
        An empty input yields an empty list.
    """
    if len(spans) == 0:
        return []

    # In-place sort, top-most span first.
    spans.sort(key=lambda item: item['bbox'][1])

    grouped = []
    pending = [spans[0]]
    for candidate in spans[1:]:
        # An isolated span on either side forces a line break.
        must_break = candidate['type'] in ['isolated'] or any(
            member['type'] in ['isolated'] for member in pending
        )
        if not must_break and __is_overlaps_y_exceeds_threshold(
                candidate['bbox'], pending[-1]['bbox']):
            pending.append(candidate)
            continue

        grouped.append(pending)
        pending = [candidate]

    # ``pending`` always holds at least one span at this point.
    grouped.append(pending)
    return grouped
|
|
|
|
|
def line_sort_spans_by_left_to_right(lines):
    """Order the spans of every line from left to right and box the line.

    Each line (a list of span dicts) is sorted in place by the left edge
    (``bbox[0]``) of its spans. The line's bounding box is the smallest box
    enclosing all of its span boxes.

    Args:
        lines: List of lines, each a non-empty list of span dicts with a
            ``'bbox'`` entry indexed as ``[x0, y0, x1, y1]``.

    Returns:
        list: One ``{"bbox": [...], "spans": [...]}`` dict per input line.
    """
    result = []
    for members in lines:
        # In-place sort; the same list object is stored under "spans".
        members.sort(key=lambda item: item['bbox'][0])
        boxes = [item['bbox'] for item in members]
        enclosing = [
            min(box[0] for box in boxes),
            min(box[1] for box in boxes),
            max(box[2] for box in boxes),
            max(box[3] for box in boxes),
        ]
        result.append({"bbox": enclosing, "spans": members})
    return result
|
|
|
def fix_text_block(block):
    """Turn a text block's flat span list into left-to-right sorted lines.

    Spans typed ``"isolated"`` are first retyped to ``"inline"`` (in place),
    so that they are merged into lines like any other span. The spans are
    then grouped into lines, each line is sorted and boxed, the result is
    stored under ``block['lines']`` and the ``'spans'`` key is removed.

    Args:
        block: Block dict with a ``'spans'`` list; it is modified in place.

    Returns:
        dict: The same ``block`` object.
    """
    raw_spans = block['spans']
    for item in raw_spans:
        if item['type'] == "isolated":
            item['type'] = "inline"

    block['lines'] = line_sort_spans_by_left_to_right(
        merge_spans_to_line(raw_spans)
    )
    del block['spans']
    return block
|
|
|
|
|
def fix_interline_block(block):
    """Turn an interline block's flat span list into sorted lines.

    The spans are grouped into lines, each line is sorted left to right and
    boxed, the result is stored under ``block['lines']`` and the ``'spans'``
    key is removed. Span types are left untouched.

    Args:
        block: Block dict with a ``'spans'`` list; it is modified in place.

    Returns:
        dict: The same ``block`` object.
    """
    grouped = merge_spans_to_line(block['spans'])
    block['lines'] = line_sort_spans_by_left_to_right(grouped)
    del block['spans']
    return block
|
|
|
def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2):
    """Return the share of ``bbox1``'s area that overlaps ``bbox2``.

    Args:
        bbox1: Reference box, indexed as ``[x0, y0, x1, y1]``.
        bbox2: Box intersected with ``bbox1``, same layout.

    Returns:
        float: ``intersection_area / bbox1_area``. ``0.0`` when the boxes do
        not intersect or when ``bbox1`` has zero area.
    """
    # Corners of the intersection rectangle.
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    if bbox1_area == 0:
        # Always a float, so every return path has the same type.
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    return intersection_area / bbox1_area
|
|
|
def fill_spans_in_blocks(blocks, spans, radio):
    """Distribute spans over blocks according to their position.

    Blocks are visited in order. A span is assigned to the first block for
    which the share of the span's area lying inside the block's box exceeds
    ``radio``. Assigned spans are removed from ``spans`` (the list is
    modified in place), so later blocks cannot claim them again.

    Args:
        blocks: Iterable of block dicts with ``"category_type"`` and
            ``'poly'`` entries. ``poly`` is read at indices 0, 1, 2 and 5
            (presumably a flattened 4-point quad -- confirm against the
            producer of these blocks).
        spans: List of span dicts with a ``"bbox"`` entry. Mutated in place.
        radio: Overlap ratio that must be strictly exceeded for a span to
            belong to a block.

    Returns:
        tuple: ``(block_with_spans, spans)`` where ``block_with_spans`` is a
        list of ``{'type', 'bbox', 'saved_info', 'spans'}`` dicts (one per
        input block) and ``spans`` is the input list holding the spans no
        block claimed.
    """
    block_with_spans = []
    for block in blocks:
        block_type = block["category_type"]
        poly = block['poly']

        # Normalise so that left <= right and top <= bottom.
        left, right = min(poly[0], poly[2]), max(poly[0], poly[2])
        top, bottom = min(poly[1], poly[5]), max(poly[1], poly[5])
        block_bbox = [left, top, right, bottom]

        # One pass splits the spans, instead of calling list.remove()
        # (a linear scan) once per claimed span.
        claimed, unclaimed = [], []
        for span in spans:
            ratio = calculate_overlap_area_in_bbox1_area_ratio(span["bbox"], block_bbox)
            if ratio > radio:
                claimed.append(span)
            else:
                unclaimed.append(span)

        # NOTE: the source carried placeholder notes here for inline-formula
        # height adjustment, retyping misdetected interline formulas and
        # bbox de-adhesion; none of these steps is implemented in this
        # function.

        block_with_spans.append({
            'type': block_type,
            'bbox': block_bbox,
            'saved_info': block,
            'spans': claimed,
        })

        if claimed:
            # Slice assignment keeps the caller's list object while dropping
            # the spans this block has taken.
            spans[:] = unclaimed

    return block_with_spans, spans
|
|
|
def fix_block_spans(block_with_spans):
    """Convert the ``'spans'`` of every block into sorted ``'lines'``.

    Blocks whose ``'type'`` is ``"isolate_formula"`` go through
    ``fix_interline_block``; every other block goes through
    ``fix_text_block``. Both helpers replace the block's ``'spans'`` key
    with a ``'lines'`` key.

    NOTE(review): an earlier docstring described nesting caption/footnote
    spans into image/table blocks; no such handling is present here.

    Args:
        block_with_spans: Iterable of block dicts with ``'type'`` and
            ``'spans'`` entries.

    Returns:
        list: The processed blocks, in input order.
    """
    return [
        fix_interline_block(entry)
        if entry['type'] == "isolate_formula"
        else fix_text_block(entry)
        for entry in block_with_spans
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def detect_lang(string):
    """Tell whether a string contains any Chinese character.

    A character counts as Chinese when it lies in the range
    U+4E00 to U+9FFF (inclusive).

    :param string: the string to inspect
    :return: ``'zh'`` if at least one such character is present,
        otherwise ``'en'`` (including for an empty string)
    """
    contains_chinese = any(u'\u4e00' <= ch <= u'\u9fff' for ch in string)
    return 'zh' if contains_chinese else 'en'
|
|
|
def ocr_escape_special_markdown_char(content):
    """Backslash-escape characters that carry Markdown meaning in body text.

    The characters ``*``, `````, ``~`` and ``$`` are each prefixed with a
    backslash; everything else is left as is.

    Args:
        content: Text to escape.

    Returns:
        str: The escaped text.
    """
    escape_table = str.maketrans({
        symbol: "\\" + symbol for symbol in ("*", "`", "~", "$")
    })
    return content.translate(escape_table)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def merge_para_with_text(para_block):
    """Render a paragraph block's lines and spans as one Markdown string.

    For every line, the stripped contents of its ``"text"`` spans are
    concatenated and passed to ``detect_lang`` to pick the line language
    (left empty when the line has no text). Each span is then rendered by
    type:

    * ``"text"``: content with Markdown special characters escaped;
    * ``'inline'`` / ``'ignore-formula'``: content wrapped in ``$...$``;
    * ``'isolated'``: content wrapped in a ``$$`` block;
    * ``'footnote'``: content wrapped in ``$...$``, with a leading ``^``
      added when the content has none;
    * any other type: skipped.

    Rendered pieces are stripped of surrounding whitespace. On lines
    detected as ``'zh'`` they are joined directly; otherwise each piece is
    followed by a single space.

    Args:
        para_block: Dict with a ``'lines'`` list; each line has a
            ``'spans'`` list of dicts with ``'type'`` and ``'content'``.

    Returns:
        str: The merged paragraph text.
    """
    pieces = []
    for line in para_block['lines']:
        line_spans = line['spans']

        # Only text spans take part in language detection.
        plain = ''.join(
            item['content'].strip()
            for item in line_spans
            if item['type'] == "text"
        )
        line_lang = detect_lang(plain) if plain != "" else ""
        separator = '' if 'zh' in line_lang else ' '

        for item in line_spans:
            kind = item['type']
            if kind == "text":
                rendered = ocr_escape_special_markdown_char(item['content'])
            elif kind in ('inline', 'ignore-formula'):
                rendered = f" ${item['content'].strip('$')}$ "
            elif kind == 'isolated':
                rendered = f"\n$$\n{item['content'].strip('$')}\n$$\n"
            elif kind == 'footnote':
                marker = item['content'].strip('$')
                if '^' in marker:
                    rendered = f" ${marker}$ "
                else:
                    rendered = f" $^{marker}$ "
            else:
                rendered = ''

            # The emptiness check is made before stripping, so a
            # whitespace-only piece still contributes its separator.
            if rendered != '':
                pieces.append(rendered.strip() + separator)

    return ''.join(pieces)