Spaces:
Paused
Paused
def getIndex(inputData):
    '''
    Return the keys or positions that can be used to index inputData.

    Returns:
    - a list of the keys when inputData is a dict (or a dict subclass);
    - range(len(inputData)) when inputData is a list (or a list subclass);
    - 0 for any other value.
    The result is therefore falsy both for non-container values and for
    empty containers.
    '''
    # isinstance instead of an exact type match: mapping/sequence
    # subclasses such as collections.OrderedDict are real containers and
    # must not be reported as leaves.
    if isinstance(inputData, dict):
        return list(inputData.keys())
    if isinstance(inputData, list):
        return range(len(inputData))
    return 0
def getSonNodes(nodeData, nodeName):
    '''
    List the direct children of a node as (child, childName) tuples.

    - For a list, every element is a child and inherits nodeName,
      the name of the list itself.
    - For a dict, every value is a child and is named by its key.
    - Any other value has no children; an empty list is returned.

    dict and list subclasses are handled like their base types.
    '''
    # Iterate the container directly instead of indexing through a
    # separately computed key/position list.
    if isinstance(nodeData, list):
        return [(child, nodeName) for child in nodeData]
    if isinstance(nodeData, dict):
        return [(child, childName) for childName, child in nodeData.items()]
    return []
def docRead(sonData, sonName):
    '''
    Render one child node as text.

    Returns:
    - sonData followed by a newline when sonData is a plain string;
    - otherwise whatever recRead(sonData, sonName) builds for it.
    '''
    if type(sonData) is not str:
        # Not a leaf string: let recRead walk the nested structure.
        return recRead(sonData, sonName)
    return sonData + '\n'
def recRead(data, key):
    '''
    Recursively flatten a nested dict/list structure into text.

    key is the name under which data was found; list elements are
    visited under that same name.

    For each child reported by getSonNodes(data, key):
    - a child named 'abstract' first contributes the marker line
      'ABSTRACT\\n';
    - children named 'ref', 'figure', 'idno', 'listBibl' or 'note'
      are skipped;
    - children named 'head', '#text', 'p' or 'surname' are rendered
      with docRead;
    - every other child is descended into with recRead.

    Returns the concatenated text, or '' when getIndex(data) is falsy
    (a non-container value or an empty container).
    '''
    skippedNames = ('ref', 'figure', 'idno', 'listBibl', 'note')
    textNames = ('head', '#text', 'p', 'surname')

    if not getIndex(data):
        return ''

    pieces = []
    for child, childName in getSonNodes(data, key):
        if childName == 'abstract':
            pieces.append('ABSTRACT\n')
        if childName in skippedNames:
            continue
        reader = docRead if childName in textNames else recRead
        pieces.append(reader(child, childName))
    return ''.join(pieces)
def splitBody(article):
    '''
    Split article text into 'head', 'body' and 'tail' line lists.

    The text is cut on newlines. Lines are collected under 'head'
    until a line that equals 'ABSTRACT' or 'REFERENCE' (compared
    case-insensitively) is met; each such marker line advances to the
    next part and is itself stored as the first line of that part.

    Returns a dict mapping each part name that received at least one
    line to its list of lines.
    '''
    parts = ['head', 'body', 'tail']
    markers = ('ABSTRACT', 'REFERENCE')
    lastPart = len(parts) - 1
    pointer = 0
    ans = dict()
    for block in article.split('\n'):
        if block.upper() in markers:
            # Clamp at 'tail': a third marker line would otherwise
            # index past the end of parts and raise IndexError.
            pointer = min(pointer + 1, lastPart)
        ans.setdefault(parts[pointer], []).append(block)
    return ans
def bodyMerge(article):
    '''
    Group the lines of article['body'] under their section headings.

    A line counts as a heading when it is 1 to 31 characters long and
    consists only of ASCII letters, digits and '.'. Every other line
    (including empty ones) is appended to the most recent heading.
    Lines that appear before the first heading are kept under the
    key ''.

    Note: a heading that repeats starts a fresh list, replacing what
    was collected under the same heading earlier, and is appended to
    the key list again.

    Returns:
    - a dict mapping heading -> list of lines, and
    - the list of headings in the order they were met.

    Raises:
    - KeyError if article has no 'body' entry.
    '''
    body = article['body']
    # Built once; the set does not change between lines.
    allowed = set('0123456789.qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM')
    ans = dict()
    keyList = []
    key = None
    for block in body:
        if 0 < len(block) < 32 and set(block) <= allowed:
            key = block
            keyList.append(key)
            ans[key] = []
            continue
        if key is None:
            # No heading seen yet: keep the text under '' rather than
            # failing on an unbound key. '' can never be a real
            # heading because headings must be non-empty.
            key = ''
            keyList.append(key)
            ans[key] = []
        ans[key].append(block)
    return ans, keyList