Spaces:
Running
Running
from bs4 import BeautifulSoup | |
import re | |
def is_html_table(text):
    """Return True if *text*, parsed as HTML, contains a ``<table>`` element.

    The text is parsed with BeautifulSoup's ``html.parser`` backend and the
    result is True as soon as any ``<table>`` tag is found anywhere in it.
    """
    parsed = BeautifulSoup(text, "html.parser")
    first_table = parsed.find('table')
    return first_table is not None
def table_matrix2html(matrix_table):
    """Convert a "matrix" table back into HTML with rowspan/colspan attributes.

    In the matrix form every grid position has its own ``<td>``. Positions
    that belong to a merged cell carry a marker tag instead of text:

    * ``<l>``  - continuation of the cell to the left (adds to its colspan)
    * ``<t>``  - continuation of the cell above (adds to its rowspan)
    * ``<lt>`` - interior of a cell that spans both rows and columns

    Args:
        matrix_table: HTML string containing a ``<table>`` in matrix form.

    Returns:
        A single-line HTML string ``<table>...</table>`` in which marker
        cells are folded into ``rowspan``/``colspan`` attributes of the cell
        that owns them.

    Raises:
        Exception: ``'colnum not match'`` if a row's ``<td>`` count differs
            from the established column count; ``'cell not match'`` if a
            marker cell is not owned by any span, or the interior of a span
            is not made of ``<lt>`` markers.
    """
    # (tag name to look for inside a <td>, marker string stored for it);
    # checked in this order, first hit wins.
    marker_tags = (('l', '<l>'), ('t', '<t>'), ('lt', '<lt>'))
    marker_values = ('<l>', '<t>', '<lt>')

    table = BeautifulSoup(matrix_table, 'html.parser').find('table')

    # Pass 1: read the grid into a dict keyed by (row, column).
    grid = {}
    height = 0
    width = 0
    for row_idx, tr in enumerate(table.find_all('tr')):
        tds = tr.find_all('td')
        for col_idx, td in enumerate(tds):
            for tag_name, marker in marker_tags:
                if td.find(tag_name):
                    value = marker
                    break
            else:
                value = td.get_text(strip=True)
            grid[(row_idx, col_idx)] = value
        cells_in_row = len(tds)
        if width == 0:
            # The first non-empty row fixes the expected column count.
            width = cells_in_row
        elif cells_in_row != width:
            raise Exception('colnum not match')
        height = row_idx + 1

    # Pass 2: walk the grid in row-major order; every surviving entry is the
    # top-left corner of a cell, and the markers it owns are removed from
    # the grid as they are absorbed.
    pieces = ['<table>']
    for row_idx in range(height):
        pieces.append('<tr>')
        for col_idx in range(width):
            if (row_idx, col_idx) not in grid:
                continue  # already absorbed by an earlier span
            text = grid[(row_idx, col_idx)]
            if text in marker_values:
                # A marker that no preceding cell claimed.
                raise Exception('cell not match')

            # Extend downwards over consecutive <t> markers.
            below = row_idx + 1
            while below < height and grid.get((below, col_idx)) == '<t>':
                del grid[(below, col_idx)]
                below += 1
            rowspan = below - row_idx

            # Extend rightwards over consecutive <l> markers.
            right = col_idx + 1
            while right < width and grid.get((row_idx, right)) == '<l>':
                del grid[(row_idx, right)]
                right += 1
            colspan = right - col_idx

            # Everything strictly inside the span must be an <lt> marker.
            for r in range(row_idx + 1, row_idx + rowspan):
                for c in range(col_idx + 1, col_idx + colspan):
                    if grid.pop((r, c)) != '<lt>':
                        raise Exception('cell not match')

            attr_parts = []
            if rowspan > 1:
                attr_parts.append(' rowspan="{}"'.format(rowspan))
            if colspan > 1:
                attr_parts.append(' colspan="{}"'.format(colspan))
            pieces.append("<td{}>{}</td>".format("".join(attr_parts), text))
        pieces.append('</tr>')
    pieces.append('</table>')
    return "".join(pieces)
def table_html2matrix(html_table):
    """Convert an HTML table with rowspan/colspan into a flat "matrix" table.

    The output has exactly one ``<td>`` per grid position. The top-left
    position of each source cell holds its stripped text; the other
    positions it covers hold a marker:

    * ``<l>``  - same row as the source cell, further right (colspan)
    * ``<t>``  - same column as the source cell, further down (rowspan)
    * ``<lt>`` - any other position inside the span

    The column count is taken from the first ``<tr>`` (sum of its cells'
    colspans). Only ``<td>`` cells are considered.

    Args:
        html_table: HTML string containing a ``<table>`` element.

    Returns:
        A single-line HTML string ``<table>...</table>`` in matrix form.
        A table without any ``<tr>`` yields ``'<table></table>'``. Grid
        positions that no source cell covers are emitted as empty cells.

    Raises:
        Exception: ``'rownum not match'`` if a rowspan runs past the last
            row, ``'colnum not match'`` if a cell runs past the last column,
            ``'cell not match'`` if two cells claim the same grid position.
        ValueError: if a rowspan/colspan attribute is not an integer.
    """
    soup = BeautifulSoup(html_table, 'html.parser')
    table = soup.find('table')
    rows = table.find_all('tr')
    rownum = len(rows)
    if rownum == 0:
        return '<table></table>'

    def span_of(td, attr_name):
        # BeautifulSoup returns attribute values as strings, so they must be
        # converted before doing arithmetic; missing/empty means a span of 1.
        return int(td.get(attr_name) or 1)

    colnum = sum(span_of(td, 'colspan') for td in rows[0].find_all('td'))

    matrix = [[None] * colnum for _ in range(rownum)]
    for rid, tr in enumerate(rows):
        cid = 0
        for td in tr.find_all('td'):
            # Skip positions already claimed by a rowspan from a row above.
            while cid < colnum and matrix[rid][cid] is not None:
                cid += 1
            rowspan = span_of(td, 'rowspan')
            colspan = span_of(td, 'colspan')
            cell_text = td.get_text(strip=True)
            for r in range(rid, rid + rowspan):
                if r >= rownum:
                    raise Exception('rownum not match')
                for c in range(cid, cid + colspan):
                    if c >= colnum:
                        raise Exception('colnum not match')
                    if matrix[r][c] is not None:
                        raise Exception('cell not match')
                    if r == rid and c == cid:
                        matrix[r][c] = cell_text
                    elif r == rid:
                        matrix[r][c] = '<l>'
                    elif c == cid:
                        matrix[r][c] = '<t>'
                    else:
                        matrix[r][c] = '<lt>'
            cid += colspan

    matrix_table = ['<table>']
    for matrix_row in matrix:
        matrix_table.append('<tr>')
        for cell in matrix_row:
            # A position no source cell covered (short row) becomes empty
            # instead of breaking the final join with a None.
            matrix_table.append('<td>{}</td>'.format('' if cell is None else cell))
        matrix_table.append('</tr>')
    matrix_table.append('</table>')
    return "".join(matrix_table)
# Dispatch table: conversion name -> function that rewrites one HTML table.
trans_func = dict(
    html2matrix=table_html2matrix,
    matrix2html=table_matrix2html,
)
def trans_markdown_text(markdown_text, trans_type):
    """Convert every HTML table found in a markdown text.

    The text is split on blank lines (``'\\n\\n'``); each piece that contains
    a ``<table>`` element is replaced by the result of the converter selected
    by *trans_type*, and the pieces are joined back with blank lines. Pieces
    without a table are left untouched.

    Args:
        markdown_text: The markdown text, or None.
        trans_type: Key into ``trans_func`` (``"html2matrix"`` or
            ``"matrix2html"``).

    Returns:
        The converted text, or None when *markdown_text* is None.

    Raises:
        KeyError: if *trans_type* is not a key of ``trans_func`` and at
            least one piece contains a table.
    """
    if markdown_text is None:
        return None
    # NOTE: a table whose markup itself contains a blank line is split into
    # several pieces here and each piece is converted on its own.
    pieces = markdown_text.split('\n\n')
    converted = [
        trans_func[trans_type](piece) if is_html_table(piece) else piece
        for piece in pieces
    ]
    return "\n\n".join(converted)