Spaces:

Rajendransp133
/

microservice-NMT

Sleeping

App Files Files Community

microservice-NMT / libs /indic_nlp_library /indicnlp /tokenize /indic_detokenize.py

Rajendransp133

Upload 86 files

ac901c7 verified 5 months ago

raw

history blame contribute delete

3.18 kB

	#
	# Copyright (c) 2013-present, Anoop Kunchukuttan
	# All rights reserved.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.
	#

	# Program for detokenizing Indian language input
	#
	# @author Anoop Kunchukuttan
	#
	"""
	De-tokenizer for Indian languages.
	"""

	import regex as re

	## detokenizer patterns
	# Punctuation that attaches to the token on its left (the space before it is
	# dropped). Includes the danda (U+0964) and double danda (U+0965).
	left_attach = r"!%)\]},.:;>?\u0964\u0965"
	pat_la = re.compile(r"[ ]([" + left_attach + r"])")

	# Punctuation that attaches to the token on its right (the space after it is
	# dropped).
	right_attach = r"#$(\[{<@"
	pat_ra = re.compile(r"([" + right_attach + r"])[ ]")

	# Punctuation that attaches on both sides (spaces on both sides are dropped).
	lr_attach = r"-/\\"
	pat_lra = re.compile(r"[ ]([" + lr_attach + r"])[ ]")

	# No attachment rule is defined for these characters, so they are left as-is:
	# &*+=^_\|~

	## date, numbers, section/article numbering
	## TODO: handle indic numbers
	pat_num_seq = re.compile(r"([0-9]+ [,.:/] )+[0-9]+")

	# NOTE: there is no special handling for e-mail addresses; an unfinished
	# pattern for them was left commented out in the original code.


	def _attach_alternating_quotes(s, quote):
	    """Attach occurrences of ``quote`` alternately to the right and to the left.

	    Occurrences are counted from the start of ``s``: the 1st, 3rd, ... are
	    treated as opening quotes (one following space is dropped) and the 2nd,
	    4th, ... as closing quotes (one preceding space is dropped). Quotes are
	    assumed to be well formed, i.e. balanced and not nested.

	    Args:
	        s (str): text to process
	        quote (str): the single quote character to handle

	    Returns:
	        str: text with spaces inside quote pairs removed
	    """
	    out = []
	    opening = True
	    skip_space = False
	    for ch in s:
	        if skip_space:
	            # Only the character immediately after an opening quote is
	            # eligible to be dropped, and only if it is a space.
	            skip_space = False
	            if ch == " ":
	                continue
	        if ch == quote:
	            if opening:
	                skip_space = True
	            elif out and out[-1] == " ":
	                # Closing quote: swallow the single space just before it.
	                out.pop()
	            out.append(quote)
	            opening = not opening
	        else:
	            out.append(ch)
	    return "".join(out)


	def trivial_detokenize_indic(text):
	    """Detokenize a string written in a Brahmi-derived Indian script.

	    A trivial detokenizer which:

	    - collapses the spaces inside number sequences (dates, decimals, ...)
	    - decides whether punctuation attaches to the left, the right or both
	    - handles quotes by alternating between right and left attachment

	    Args:
	        text (str): tokenized text to process

	    Returns:
	        str: detokenized string
	    """
	    # numbers and dates: "1 , 000 . 50" -> "1,000.50"
	    s = pat_num_seq.sub(lambda m: m.group(0).replace(" ", ""), text)

	    # NOTE: merging consecutive single quotes / backticks into double quotes
	    # is deliberately not done (it was disabled in the original code).

	    # Order matters: two-sided attachment first, then left, then right.
	    s = pat_lra.sub(r"\1", s)
	    s = pat_la.sub(r"\1", s)
	    s = pat_ra.sub(r"\1", s)

	    # Quotes are processed directly on the characters rather than through
	    # placeholder strings, so text that happens to contain a placeholder-like
	    # literal (e.g. "@LA") is never rewritten.
	    for punc in "'\"`":
	        s = _attach_alternating_quotes(s, punc)

	    return s


	def trivial_detokenize(text, lang="hi"):
	    """Detokenize a string in a language of the Indian subcontinent.

	    Delegates to ``trivial_detokenize_indic``, a trivial detokenizer which:

	    - decides whether punctuation attaches to left/right or both
	    - handles number sequences
	    - handles quotes (deciding left or right attachment)

	    Args:
	        text (str): tokenized text to process
	        lang (str): language code. Kept for interface compatibility; it is
	            currently ignored and every language takes the same code path.

	    Returns:
	        str: detokenized string
	    """
	    # No per-language dispatch or validation happens here, so nothing is
	    # raised for an unsupported ``lang``.
	    return trivial_detokenize_indic(text)