OCRFlux / ocrflux /table_format.py
mirnaresearch's picture
Initial commit for HF Space (no images)
ca5b08e
from bs4 import BeautifulSoup
import re
def is_html_table(text):
    """Return True when *text*, parsed as HTML, contains a ``<table>`` element."""
    return BeautifulSoup(text, "html.parser").find('table') is not None
def table_matrix2html(matrix_table):
    """Rebuild a regular HTML table (with rowspan/colspan) from its matrix form.

    In the matrix form every grid position has its own ``<td>``. A position
    holds either cell text or one of the marker tags ``<l>``, ``<t>``,
    ``<lt>``. While scanning the grid in row-major order, each text cell
    absorbs the consecutive ``<t>`` markers directly below it (rowspan), the
    consecutive ``<l>`` markers directly to its right (colspan), and the
    ``<lt>`` markers filling the rest of that rectangle.

    Args:
        matrix_table: HTML text; the first ``<table>`` found in it is read.
            Only ``<tr>`` / ``<td>`` elements are looked at.

    Returns:
        The rebuilt table as one HTML string with no whitespace between tags.

    Raises:
        Exception: ``'colnum not match'`` when rows differ in their number of
            ``<td>`` cells; ``'cell not match'`` when a marker is reached that
            no text cell absorbed, or when the inside of a spanned rectangle
            is not made of ``<lt>`` markers.
    """
    marker_tags = ('l', 't', 'lt')
    markers = ('<l>', '<t>', '<lt>')
    table = BeautifulSoup(matrix_table, 'html.parser').find('table')

    # Pass 1: read the matrix into a {(row, col): value} grid.
    grid = {}
    width = 0
    height = 0
    for row_idx, tr in enumerate(table.find_all('tr')):
        cells = tr.find_all('td')
        for col_idx, td in enumerate(cells):
            # Markers are probed in the fixed order l, t, lt; anything else
            # is an ordinary text cell.
            for tag_name in marker_tags:
                if td.find(tag_name):
                    value = '<' + tag_name + '>'
                    break
            else:
                value = td.get_text(strip=True)
            grid[(row_idx, col_idx)] = value
        row_width = len(cells)
        if width == 0:
            # The first non-empty row fixes the expected column count.
            width = row_width
        elif row_width != width:
            raise Exception('colnum not match')
        height = row_idx + 1

    # Pass 2: emit cells, folding markers into rowspan/colspan attributes.
    pieces = ['<table>']
    for row_idx in range(height):
        pieces.append('<tr>')
        for col_idx in range(width):
            content = grid.get((row_idx, col_idx))
            if content is None:
                # Position already absorbed by an earlier spanning cell.
                continue
            if content in markers:
                raise Exception('cell not match')

            # Count (and consume) the run of <t> markers straight below.
            rowspan = 1
            while (row_idx + rowspan < height
                   and grid.get((row_idx + rowspan, col_idx)) == '<t>'):
                del grid[(row_idx + rowspan, col_idx)]
                rowspan += 1

            # Count (and consume) the run of <l> markers straight to the right.
            colspan = 1
            while (col_idx + colspan < width
                   and grid.get((row_idx, col_idx + colspan)) == '<l>'):
                del grid[(row_idx, col_idx + colspan)]
                colspan += 1

            # Everything else inside the rectangle has to be <lt>.
            for r in range(row_idx + 1, row_idx + rowspan):
                for c in range(col_idx + 1, col_idx + colspan):
                    if grid[(r, c)] != '<lt>':
                        raise Exception('cell not match')
                    del grid[(r, c)]

            attrs = []
            if rowspan > 1:
                attrs.append(' rowspan="{}"'.format(rowspan))
            if colspan > 1:
                attrs.append(' colspan="{}"'.format(colspan))
            pieces.append("<td{}>{}</td>".format(''.join(attrs), content))
        pieces.append('</tr>')
    pieces.append('</table>')
    return "".join(pieces)
def table_html2matrix(html_table):
    """Convert an HTML table with rowspan/colspan into the flat matrix form.

    In the result every grid position has its own ``<td>``. The top-left
    position of each source cell carries the cell text; the other positions
    covered by a spanning cell carry a marker: ``<l>`` for positions in the
    origin's row, ``<t>`` for positions in the origin's column, and ``<lt>``
    for the remaining positions of the rectangle.

    Args:
        html_table: HTML text; the first ``<table>`` found in it is converted.
            Only ``<tr>`` / ``<td>`` elements are looked at. The grid width is
            the sum of the colspans of the first row's cells.

    Returns:
        The matrix table as one HTML string with no whitespace between tags.

    Raises:
        Exception: ``'rownum not match'`` when a rowspan runs past the last
            row; ``'colnum not match'`` when a cell falls outside the grid
            width or a grid position is never filled; ``'cell not match'``
            when two cells claim the same position.
        IndexError: if the table contains no ``<tr>``.
        ValueError: if a ``rowspan`` / ``colspan`` attribute is not an integer.
    """
    soup = BeautifulSoup(html_table, 'html.parser')
    table = soup.find('table')
    rows = table.find_all('tr')
    rownum = len(rows)

    # Attribute values come back from the parser as strings, so they must be
    # converted before being used in arithmetic or range().
    colnum = sum(int(td.get('colspan', 1)) for td in rows[0].find_all('td'))

    matrix = [[None] * colnum for _ in range(rownum)]
    for rid, tr in enumerate(rows):
        cid = 0
        for td in tr.find_all('td'):
            # Skip positions already claimed by a rowspan from a row above.
            # If the row is full, cid ends at colnum and the bounds check
            # below reports it.
            while cid < colnum and matrix[rid][cid] is not None:
                cid += 1
            rowspan = int(td.get('rowspan', 1))
            colspan = int(td.get('colspan', 1))
            cell_text = td.get_text(strip=True)
            for r in range(rid, rid + rowspan):
                if r >= rownum:
                    raise Exception('rownum not match')
                for c in range(cid, cid + colspan):
                    if c >= colnum:
                        raise Exception('colnum not match')
                    if matrix[r][c] is not None:
                        raise Exception('cell not match')
                    if r == rid and c == cid:
                        matrix[r][c] = cell_text
                    elif r == rid:
                        matrix[r][c] = '<l>'
                    elif c == cid:
                        matrix[r][c] = '<t>'
                    else:
                        matrix[r][c] = '<lt>'
            cid += colspan

    matrix_table = ['<table>']
    for row in matrix:
        matrix_table.append('<tr>')
        for cell_text in row:
            if cell_text is None:
                # A row supplied fewer cells than the grid width; fail with
                # the same kind of error as the other shape checks instead
                # of an opaque TypeError from str.join.
                raise Exception('colnum not match')
            matrix_table.append('<td>')
            matrix_table.append(cell_text)
            matrix_table.append('</td>')
        matrix_table.append('</tr>')
    matrix_table.append('</table>')
    return "".join(matrix_table)
# Dispatch table: conversion name -> table converter taking and returning an HTML string.
trans_func = {"html2matrix": table_html2matrix, "matrix2html": table_matrix2html}
def trans_markdown_text(markdown_text, trans_type):
    """Apply a table conversion to every table paragraph of a markdown text.

    The text is split into paragraphs on blank lines (``'\\n\\n'``). Each
    paragraph in which ``is_html_table`` finds a ``<table>`` is replaced by
    the return value of the selected converter; all other paragraphs are
    kept unchanged. The paragraphs are then re-joined with ``'\\n\\n'``.

    Args:
        markdown_text: The markdown text, or ``None``.
        trans_type: Key into ``trans_func`` selecting the conversion
            (``"html2matrix"`` or ``"matrix2html"``).

    Returns:
        The converted text, or ``None`` when ``markdown_text`` is ``None``.

    Raises:
        KeyError: if a table paragraph is found and ``trans_type`` is not a
            key of ``trans_func``.
        Exception: whatever the selected converter raises for a table it
            cannot convert.
    """
    if markdown_text is None:
        return None
    # The converter is looked up per table paragraph on purpose, so an
    # unknown trans_type only fails when there is actually a table to convert.
    return "\n\n".join(
        trans_func[trans_type](paragraph) if is_html_table(paragraph) else paragraph
        for paragraph in markdown_text.split('\n\n')
    )