Spaces:

mirnaresearch
/

OCRFlux

Running

App Files Files Community

OCRFlux / ocrflux /table_format.py

mirnaresearch

Initial commit for HF Space (no images)

ca5b08e about 1 month ago

raw

history blame contribute delete

4.77 kB


	from bs4 import BeautifulSoup
	import re

	def is_html_table(text):
	soup = BeautifulSoup(text, "html.parser")
	return soup.find('table') is not None

	def table_matrix2html(matrix_table):
	soup = BeautifulSoup(matrix_table, 'html.parser')
	table = soup.find('table')
	rownum = 0
	colnum = 0
	cell_dict = {}
	rid = 0
	for tr in table.find_all('tr'):
	cid = 0
	for td in tr.find_all('td'):
	if td.find('l'):
	cell_dict[(rid, cid)] = '<l>'
	elif td.find('t'):
	cell_dict[(rid, cid)] = '<t>'
	elif td.find('lt'):
	cell_dict[(rid, cid)] = '<lt>'
	else:
	text = td.get_text(strip=True)
	cell_dict[(rid, cid)] = text
	cid += 1
	if colnum == 0:
	colnum = cid
	elif cid != colnum:
	raise Exception('colnum not match')
	rid += 1
	rownum = rid
	html_table = ['<table>']
	for rid in range(rownum):
	html_table.append('<tr>')
	for cid in range(colnum):
	if (rid, cid) not in cell_dict.keys():
	continue
	text = cell_dict[(rid, cid)]
	if text == '<l>' or text == '<t>' or text == '<lt>':
	raise Exception('cell not match')
	rowspan = 1
	colspan = 1
	for r in range(rid+1, rownum):
	if (r, cid) in cell_dict.keys() and cell_dict[(r, cid)] == '<t>':
	rowspan += 1
	del cell_dict[(r, cid)]
	else:
	break
	for c in range(cid+1, colnum):
	if (rid, c) in cell_dict.keys() and cell_dict[(rid, c)] == '<l>':
	colspan += 1
	del cell_dict[(rid, c)]
	else:
	break
	for r in range(rid+1, rid+rowspan):
	for c in range(cid+1, cid+colspan):
	if cell_dict[(r, c)] != '<lt>':
	raise Exception('cell not match')
	del cell_dict[(r, c)]
	attr = ''
	if rowspan > 1:
	attr += ' rowspan="{}"'.format(rowspan)
	if colspan > 1:
	attr += ' colspan="{}"'.format(colspan)
	html_table.append("<td{}>{}</td>".format(attr, text))
	html_table.append('</tr>')
	html_table.append('</table>')
	return "".join(html_table)

	def table_html2matrix(html_table):
	soup = BeautifulSoup(html_table, 'html.parser')
	table = soup.find('table')
	rownum = len(table.find_all('tr'))
	colnum = 0
	tr = table.find_all('tr')[0]
	for td in tr.find_all('td'):
	colnum += td.get('colspan', 1)
	matrix = [[None for _ in range(colnum)] for _ in range(rownum)]

	rid = 0
	for tr in table.find_all('tr'):
	cid = 0
	for td in tr.find_all('td'):
	for c in range(cid, colnum):
	if matrix[rid][c] is None:
	break
	cid = c
	rowspan = td.get('rowspan', 1)
	colspan = td.get('colspan', 1)
	cell_text = td.get_text(strip=True)
	for r in range(rid,rid+rowspan):
	if r >= rownum:
	raise Exception('rownum not match')
	for c in range(cid,cid+colspan):
	if c >= colnum:
	raise Exception('colnum not match')
	if matrix[r][c] is not None:
	raise Exception('cell not match')
	if r == rid and c == cid:
	matrix[r][c] = cell_text
	elif r == rid:
	matrix[r][c] = '<l>'
	elif c == cid:
	matrix[r][c] = '<t>'
	else:
	matrix[r][c] = '<lt>'
	cid += colspan
	rid += 1

	matrix_table = ['<table>']
	for rid in range(rownum):
	matrix_table.append('<tr>')
	for cid in range(colnum):
	matrix_table.append('<td>')
	cell_text = matrix[rid][cid]
	matrix_table.append(cell_text)
	matrix_table.append('</td>')
	matrix_table.append('</tr>')
	matrix_table.append('</table>')
	return "".join(matrix_table)

	trans_func = {
	"html2matrix": table_html2matrix,
	"matrix2html": table_matrix2html,
	}

	def trans_markdown_text(markdown_text,trans_type):
	if markdown_text == None:
	return None
	text_list = markdown_text.split('\n\n')
	for i,text in enumerate(text_list):
	if is_html_table(text):
	text_list[i] = trans_func[trans_type](text)
	return "\n\n".join(text_list)