Spaces:

ml6team
/

doc-to-slides

Paused

App Files Files Community

doc-to-slides / utils.py

com3dian

Create utils.py

2654a92 verified over 1 year ago

raw

history blame

3.15 kB

	def getIndex(inputData):
	    '''
	    Get the valid indices of a dict or a list.

	    Returns:
	    - a list of the keys if inputData is a dict (or a dict
	      subclass such as collections.OrderedDict).
	    - range(len(inputData)) if inputData is a list (or a list
	      subclass).
	    - 0 for every other type. The value is falsy on purpose:
	      callers use the truthiness of the result to decide
	      whether there is anything to traverse.
	    '''
	    # isinstance (not `type(...) is ...`) so that container
	    # subclasses are indexed instead of being treated as leaves.
	    if isinstance(inputData, dict):
	        return list(inputData)
	    if isinstance(inputData, list):
	        return range(len(inputData))
	    return 0

	def getSonNodes(nodeData, nodeName):
	    '''
	    Given a nodeData object and a nodeName string,
	    returns a list of (child, name) tuples for the
	    direct children of the node.

	    - For a list, every element is a child and inherits
	      nodeName as its name.
	    - For a dict, every value is a child and its key is
	      the name.
	    - For any other type there are no children and an
	      empty list is returned.
	    '''
	    # isinstance so that list/dict subclasses (e.g. OrderedDict)
	    # are expanded rather than reported as having no children.
	    if isinstance(nodeData, list):
	        return [(child, nodeName) for child in nodeData]
	    if isinstance(nodeData, dict):
	        return [(child, name) for name, child in nodeData.items()]
	    return []

	def docRead(sonData, sonName):
	    '''
	    Given a sonData object and its corresponding
	    sonName string, returns a string representation
	    of the data.
	    Returns:
	    - If the sonData object is a string (or a str
	      subclass), its value followed by a newline.
	    - Otherwise, whatever recRead(sonData, sonName)
	      builds by traversing the nested structure.
	    '''
	    # isinstance so that str subclasses are emitted as text
	    # instead of being passed to recRead, which yields ''
	    # for anything that is not a dict or list.
	    if isinstance(sonData, str):
	        return sonData + '\n'
	    return recRead(sonData, sonName)

	# Node names whose whole subtree is dropped from the output.
	_RECREAD_STOPWORDS = ('ref', 'figure', 'idno', 'listBibl', 'note')
	# Node names whose string values are emitted as text lines.
	_RECREAD_KEYWORDS = ('head', '#text', 'p', 'surname')

	def recRead(data, key):
	    '''
	    Recursively flatten a nested dict/list structure into
	    newline-terminated text.

	    Parameters:
	    - data: the node to traverse. Anything that is not a
	      dict or list contributes an empty string.
	    - key: the name of this node. Elements of a list
	      inherit it as their own name.

	    Notes:
	    - Children whose name is in the stop-word tuple are
	      skipped together with everything below them.
	    - String children whose name is in the keyword tuple
	      are emitted followed by a newline (the same result
	      docRead gives for a string).
	    - Every other child is traversed recursively under
	      its own name.
	    - A child named 'abstract' first emits the marker
	      line 'ABSTRACT'. The check runs per child, so each
	      element of a list named 'abstract' emits its own
	      marker.
	    '''
	    # Enumerate (child, name) pairs; non-containers have none.
	    if isinstance(data, dict):
	        children = [(son, father) for father, son in data.items()]
	    elif isinstance(data, list):
	        children = [(son, key) for son in data]
	    else:
	        return ''

	    # Collect fragments and join once instead of repeated `+=`.
	    parts = []
	    for son, father in children:
	        if father == 'abstract':
	            parts.append('ABSTRACT\n')
	        if father in _RECREAD_STOPWORDS:
	            continue
	        if father in _RECREAD_KEYWORDS and isinstance(son, str):
	            parts.append(son + '\n')
	        else:
	            # Non-string keyword values and all other nodes are
	            # traversed; a bare string under a non-keyword name
	            # yields '' here.
	            parts.append(recRead(son, father))
	    return ''.join(parts)

	def splitBody(article):
	    '''
	    Split the article text into 'head', 'body' and 'tail'.

	    The text is split on newlines. Each line that equals
	    'ABSTRACT' or 'REFERENCE' (case-insensitive) advances
	    to the next part, and the marker line itself is stored
	    as the first line of the part it opens.

	    Returns a dict mapping each part name that received at
	    least one line to its list of lines.
	    '''
	    parts = ('head', 'body', 'tail')
	    lastPart = len(parts) - 1
	    pointer = 0
	    ans = dict()
	    for block in article.split('\n'):
	        if block.upper() in ('ABSTRACT', 'REFERENCE'):
	            # Clamp: a third or later marker line stays in 'tail'
	            # instead of indexing past the end of `parts`.
	            pointer = min(pointer + 1, lastPart)
	        ans.setdefault(parts[pointer], []).append(block)
	    return ans

	# Characters a heading line may consist of: ASCII letters, digits, '.'.
	_HEADING_CHARS = frozenset(
	    '0123456789.qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM'
	)

	def bodyMerge(article):
	    '''
	    Group the lines of article['body'] under their headings.

	    A line is a heading when it is 1 to 31 characters long
	    and contains only ASCII letters, digits and '.'.
	    Every other line (including empty ones) is appended to
	    the most recent heading.

	    Returns:
	    - ans: dict mapping each heading to its list of lines.
	      A repeated heading starts a fresh list, replacing the
	      lines collected under its earlier occurrence.
	    - keyList: the headings in order of appearance (a
	      repeated heading appears once per occurrence).

	    Lines that come before the first heading are kept under
	    the empty-string heading ''. If article has no 'body'
	    entry, ({}, []) is returned.
	    '''
	    ans = dict()
	    keyList = []
	    key = None  # no heading seen yet
	    for block in article.get('body', []):
	        if 0 < len(block) < 32 and set(block) <= _HEADING_CHARS:
	            key = block
	            keyList.append(key)
	            ans[key] = []
	            continue
	        if key is None:
	            # Text before any heading: '' can never collide with a
	            # real heading because headings are non-empty.
	            key = ''
	            keyList.append(key)
	            ans[key] = []
	        ans[key].append(block)
	    return ans, keyList