Rajendransp133's picture
Upload 86 files
ac901c7 verified
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Program for detokenizing Indian language input
#
# @author Anoop Kunchukuttan
#
"""
De-tokenizer for Indian languages.
"""
import regex as re
## Detokenizer patterns
#
# Each character-class string below is spliced verbatim into a regex
# character class, so the escapes inside it are regex escapes.

# Punctuation that attaches to the token on its left (the space before it is
# dropped). \u0964 and \u0965 are the danda and double danda.
left_attach = r"!%)\]},.:;>?\u0964\u0965"
# Punctuation that attaches to the token on its right (the space after it is
# dropped).
right_attach = r"#$(\[{<@"
# Punctuation that attaches on both sides (both surrounding spaces dropped).
lr_attach = r"-/\\"
# NOTE(review): no attachment rule is defined for these characters
# (original note: donknow=u'&*+=^_|~').

pat_la = re.compile(r"[ ]([%s])" % left_attach)
pat_ra = re.compile(r"([%s])[ ]" % right_attach)
pat_lra = re.compile(r"[ ]([%s])[ ]" % lr_attach)

## Dates, numbers, section/article numbering: runs of ASCII digit groups
## separated by a space-padded ',', '.', ':' or '/', e.g. "12 / 3 / 2020".
## TODO: handle indic numbers
pat_num_seq = re.compile(r"([0-9]+ [,.:/] )+[0-9]+")

### e-mail address (not implemented; unfinished draft kept for reference)
# pat_num=re.compile(ur'[a-zA-Z]+[ ]?
def _attach_paired_quotes(text, quote):
    """Attach alternating occurrences of ``quote`` to the right, then the left.

    Occurrences of ``quote`` are counted from the start of ``text``. The
    1st, 3rd, ... occurrence is treated as an opening quote and one space
    directly after it is dropped; the 2nd, 4th, ... occurrence is treated
    as a closing quote and one space directly before it is dropped. This
    assumes the quotes in ``text`` are well formed (properly paired).

    Args:
        text (str): text to process
        quote (str): the single quote character to attach

    Returns:
        str: ``text`` with the spaces around ``quote`` removed as described
    """
    pieces = []
    seen = 0
    # True only for the character immediately following an opening quote.
    swallow_space = False
    for ch in text:
        if ch == quote:
            is_closing = seen % 2 == 1
            # A closing quote pulls itself onto the preceding token, unless
            # that space was already consumed by the opening quote.
            if is_closing and pieces and pieces[-1] == " ":
                pieces.pop()
            pieces.append(ch)
            swallow_space = not is_closing
            seen += 1
        elif swallow_space and ch == " ":
            # Space right after an opening quote: drop it.
            swallow_space = False
        else:
            pieces.append(ch)
            swallow_space = False
    return "".join(pieces)


def trivial_detokenize_indic(text):
    """detokenize string for Indian language scripts using Brahmi-derived scripts

    A trivial detokenizer which:

    - handles number sequences ("12 / 3 / 2020" becomes "12/3/2020")
    - decides whether punctuation attaches to left/right or both
    - handles quotes smartly (deciding left or right attachment by
      alternating between opening and closing for each quote character)

    Args:
        text (str): tokenized text to process

    Returns:
        str: detokenized string
    """
    ### some normalizations

    # numbers and dates: remove the spaces inside every matched sequence
    s = pat_num_seq.sub(lambda m: m.group(0).replace(" ", ""), text)

    ### consecutive single quotes or backticks become double quotes
    # s=s.replace("' '", "''")
    # s=s.replace("` `", '``')

    # Order matters: two-sided attachment first, then left, then right.
    s = pat_lra.sub(r"\1", s)
    s = pat_la.sub(r"\1", s)
    s = pat_ra.sub(r"\1", s)

    # Assumes well formedness of quotes and alternates between right and
    # left attach. The quotes are rewritten in a single pass instead of via
    # "@RA"/"@LA" placeholder tokens: placeholders collide with real text
    # (e.g. "user @ RAM" is joined to "user @RAM" above, and the "@RA" in it
    # would then be turned into a quote character).
    for quote in "'\"`":
        s = _attach_paired_quotes(s, quote)

    return s
def trivial_detokenize(text, lang="hi"):
    """Detokenize a string in a language of the Indian subcontinent.

    Thin entry point that hands the text to ``trivial_detokenize_indic``,
    which:

    - decides whether punctuation attaches to left/right or both
    - handles number sequences
    - handles quotes smartly (deciding left or right attachment)

    Args:
        text (str): tokenized text to process
        lang (str): language code. It is accepted but not consulted here;
            every value is routed to the same detokenizer.

    Returns:
        str: detokenized string
    """
    detokenized = trivial_detokenize_indic(text)
    return detokenized