File size: 4,773 Bytes
ca5b08e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144

from bs4 import BeautifulSoup
import re

def is_html_table(text):
    """Return True if *text*, parsed as HTML, contains a ``<table>`` element."""
    first_table = BeautifulSoup(text, "html.parser").find('table')
    return first_table is not None

def table_matrix2html(matrix_table):
    """Convert a "matrix" table back into an HTML table with row/col spans.

    In the matrix form every grid position has its own ``<td>``. A cell that
    contains an ``<l>`` tag continues the cell to its left, ``<t>`` continues
    the cell above it, and ``<lt>`` fills the interior of a cell that spans
    several rows and columns. Any other cell is an anchor whose stripped text
    becomes the content of the emitted ``<td>``.

    Args:
        matrix_table: HTML string containing the matrix-form ``<table>``.

    Returns:
        The HTML table as a single string, with ``rowspan``/``colspan``
        attributes emitted only when they are greater than 1.

    Raises:
        Exception: 'colnum not match' if a row's cell count differs from the
            established column count; 'cell not match' if a marker cell is
            not covered by any anchor, or a span interior is not ``<lt>``.
    """
    table = BeautifulSoup(matrix_table, 'html.parser').find('table')

    # Pass 1: read the grid into a {(row, col): content} mapping.
    cells = {}
    width = 0
    height = 0
    for row_idx, row in enumerate(table.find_all('tr')):
        row_len = 0
        for col_idx, cell in enumerate(row.find_all('td')):
            # Probe order matters only for cells holding several markers.
            for marker in ('l', 't', 'lt'):
                if cell.find(marker):
                    cells[(row_idx, col_idx)] = '<{}>'.format(marker)
                    break
            else:
                cells[(row_idx, col_idx)] = cell.get_text(strip=True)
            row_len = col_idx + 1
        if width == 0:
            width = row_len
        elif row_len != width:
            raise Exception('colnum not match')
        height = row_idx + 1

    # Pass 2: walk the grid; each surviving entry is an anchor that absorbs
    # (and removes) the marker cells belonging to its span.
    marker_values = ('<l>', '<t>', '<lt>')
    pieces = ['<table>']
    for row_idx in range(height):
        pieces.append('<tr>')
        for col_idx in range(width):
            if (row_idx, col_idx) not in cells:
                continue  # already absorbed by an earlier anchor
            content = cells[(row_idx, col_idx)]
            if content in marker_values:
                raise Exception('cell not match')

            # Extend downwards over consecutive '<t>' cells.
            rowspan = 1
            while cells.get((row_idx + rowspan, col_idx)) == '<t>':
                del cells[(row_idx + rowspan, col_idx)]
                rowspan += 1

            # Extend rightwards over consecutive '<l>' cells.
            colspan = 1
            while cells.get((row_idx, col_idx + colspan)) == '<l>':
                del cells[(row_idx, col_idx + colspan)]
                colspan += 1

            # Everything strictly inside the rectangle must be '<lt>'.
            for r in range(row_idx + 1, row_idx + rowspan):
                for c in range(col_idx + 1, col_idx + colspan):
                    if cells.pop((r, c)) != '<lt>':
                        raise Exception('cell not match')

            attr_parts = []
            if rowspan > 1:
                attr_parts.append(' rowspan="{}"'.format(rowspan))
            if colspan > 1:
                attr_parts.append(' colspan="{}"'.format(colspan))
            pieces.append("<td{}>{}</td>".format("".join(attr_parts), content))
        pieces.append('</tr>')
    pieces.append('</table>')
    return "".join(pieces)

def _span_attr(td, name):
    """Return a cell's ``rowspan``/``colspan`` attribute as a positive int.

    BeautifulSoup hands attribute values back as strings, so the value is
    converted explicitly. A missing, non-numeric or non-positive value falls
    back to 1.
    """
    try:
        span = int(td.get(name, 1))
    except (TypeError, ValueError):
        return 1
    return span if span > 0 else 1

def table_html2matrix(html_table):
    """Convert an HTML table with row/col spans into its "matrix" form.

    The matrix form has one ``<td>`` per grid position. The top-left position
    of every original cell keeps the cell's stripped text; the other
    positions covered by its span are filled with markers: ``<l>`` for
    positions in the anchor's row, ``<t>`` for positions in the anchor's
    column, and ``<lt>`` for the remaining interior positions.

    The column count is taken from the sum of the first row's colspans.

    Args:
        html_table: HTML string containing a ``<table>``.

    Returns:
        The matrix-form table as a single HTML string. A table without any
        ``<tr>`` yields ``'<table></table>'``.

    Raises:
        Exception: 'rownum not match' if a rowspan runs past the last row;
            'colnum not match' if a cell runs past the last column or a row
            leaves grid positions uncovered; 'cell not match' if two cells
            claim the same grid position.
    """
    soup = BeautifulSoup(html_table, 'html.parser')
    table = soup.find('table')
    rows = table.find_all('tr')
    rownum = len(rows)
    if rownum == 0:
        # Nothing to lay out; avoids indexing into an empty row list.
        return '<table></table>'
    colnum = sum(_span_attr(td, 'colspan') for td in rows[0].find_all('td'))
    matrix = [[None] * colnum for _ in range(rownum)]

    for rid, tr in enumerate(rows):
        cid = 0
        for td in tr.find_all('td'):
            # Skip positions already claimed by a rowspan from a row above.
            # If none is free, cid ends at colnum and the bounds check below
            # reports the overflow.
            while cid < colnum and matrix[rid][cid] is not None:
                cid += 1
            rowspan = _span_attr(td, 'rowspan')
            colspan = _span_attr(td, 'colspan')
            cell_text = td.get_text(strip=True)
            for r in range(rid, rid + rowspan):
                if r >= rownum:
                    raise Exception('rownum not match')
                for c in range(cid, cid + colspan):
                    if c >= colnum:
                        raise Exception('colnum not match')
                    if matrix[r][c] is not None:
                        raise Exception('cell not match')
                    if r == rid and c == cid:
                        matrix[r][c] = cell_text
                    elif r == rid:
                        matrix[r][c] = '<l>'
                    elif c == cid:
                        matrix[r][c] = '<t>'
                    else:
                        matrix[r][c] = '<lt>'
            cid += colspan

    # NOTE: cell text is written back verbatim (no HTML escaping), matching
    # the marker tags that are emitted as raw markup.
    matrix_table = ['<table>']
    for row in matrix:
        matrix_table.append('<tr>')
        for cell_text in row:
            if cell_text is None:
                # The row covered fewer columns than the first row.
                raise Exception('colnum not match')
            matrix_table.append('<td>')
            matrix_table.append(cell_text)
            matrix_table.append('</td>')
        matrix_table.append('</tr>')
    matrix_table.append('</table>')
    return "".join(matrix_table)

# Dispatch table mapping a conversion name to its converter function;
# trans_markdown_text looks up its trans_type argument here.
trans_func = {
    "html2matrix": table_html2matrix,
    "matrix2html": table_matrix2html,
}
      
def trans_markdown_text(markdown_text,trans_type):
    """Convert every HTML table found in a markdown document.

    The text is split on blank lines (``'\\n\\n'``); each chunk that contains
    a ``<table>`` element is replaced by the output of the converter selected
    by *trans_type*, and the chunks are joined back with blank lines.

    Args:
        markdown_text: The markdown source, or None.
        trans_type: Key into ``trans_func`` ("html2matrix" or "matrix2html").

    Returns:
        The converted markdown text, or None when *markdown_text* is None.

    Raises:
        KeyError: If a table chunk is found and *trans_type* is not a key of
            ``trans_func``.
    """
    if markdown_text is None:
        return None
    text_list = markdown_text.split('\n\n')
    for i, text in enumerate(text_list):
        if is_html_table(text):
            # Looked up lazily so an unknown trans_type only fails when
            # there is actually a table to convert.
            text_list[i] = trans_func[trans_type](text)
    return "\n\n".join(text_list)