#
#  Copyright (c) 2013-present, Anoop Kunchukuttan
#  All rights reserved.
#
#  This source code is licensed under the MIT license found in the
#  LICENSE file in the root directory of this source tree.
#

# Program for detokenizing Indian language input
#
# @author Anoop Kunchukuttan
#
"""
De-tokenizer for Indian languages.
"""

try:
    import regex as re
except ImportError:
    # Every pattern below uses only syntax that the standard-library ``re``
    # module also accepts, so it is a safe fallback when ``regex`` is absent.
    import re

## detokenizer patterns

# Punctuation that attaches to the token on its left; the two escapes are
# the danda (U+0964) and double danda (U+0965).
left_attach = r"!%)\]},.:;>?\u0964\u0965"
pat_la = re.compile(r"[ ]([" + left_attach + r"])")

# Punctuation that attaches to the token on its right.
right_attach = r"#$(\[{<@"
pat_ra = re.compile(r"([" + right_attach + r"])[ ]")

# Punctuation that attaches to the tokens on both sides.
lr_attach = r"-/\\"
pat_lra = re.compile(r"[ ]([" + lr_attach + r"])[ ]")

# Characters with no attachment rule yet: & * + = ^ _ | ~

## date, numbers, section/article numbering
## TODO: handle indic numbers
pat_num_seq = re.compile(r"([0-9]+ [,.:/] )+[0-9]+")

## TODO: handle e-mail addresses

# Quote characters whose attachment side alternates (opening, closing, ...).
_ALTERNATING_QUOTES = "'\"`"


def _join_number_sequences(text: str) -> str:
    """Remove the spaces inside tokenized number sequences.

    Every match of ``pat_num_seq`` (for example ``12 / 3 / 2013``) is
    rewritten without its internal spaces (``12/3/2013``); everything else
    is returned unchanged.
    """
    return pat_num_seq.sub(lambda m: m.group(0).replace(" ", ""), text)


def _attach_quotes(text: str, quote: str) -> str:
    """Attach alternating occurrences of ``quote`` to the right and to the left.

    The 1st, 3rd, ... occurrence is treated as an opening quote and swallows
    one space that directly follows it; the 2nd, 4th, ... occurrence is
    treated as a closing quote and swallows one space that directly precedes
    it.  Well-formed (balanced) quoting is assumed.

    The text is processed in a single pass without inserting any placeholder
    markers, so input that happens to contain marker-like sequences such as
    ``@RA`` or ``@LA`` is left untouched.
    """
    pieces = []
    is_opening = True
    drop_next_space = False
    for ch in text:
        if ch == quote:
            if is_opening:
                # opening quote: the space right after it (if any) is dropped
                drop_next_space = True
            else:
                # closing quote: drop the single space right before it
                if pieces and pieces[-1] == " ":
                    pieces.pop()
                drop_next_space = False
            pieces.append(quote)
            is_opening = not is_opening
            continue
        if drop_next_space and ch == " ":
            # only one space is consumed per opening quote
            drop_next_space = False
            continue
        drop_next_space = False
        pieces.append(ch)
    return "".join(pieces)


def trivial_detokenize_indic(text: str) -> str:
    """detokenize string for Indian language scripts using Brahmi-derived scripts

    A trivial detokenizer which:

        - decides whether punctuation attaches to left/right or both
        - handles number sequences
        - handles quotes smartly (deciding left or right attachment)

    Args:
        text (str): tokenized text to process

    Returns:
        str: detokenized string
    """
    # numbers and dates: "12 / 3 / 2013" -> "12/3/2013"
    s = _join_number_sequences(text)

    # Both-side attachment must run first: "-", "/" and "\" are only joined
    # when surrounded by spaces on both sides.
    s = pat_lra.sub(r"\1", s)
    s = pat_la.sub(r"\1", s)
    s = pat_ra.sub(r"\1", s)

    # assumes well formedness of quotes and alternates between right and
    # left attach, independently for each quote character
    for quote in _ALTERNATING_QUOTES:
        s = _attach_quotes(s, quote)

    return s


def trivial_detokenize(text: str, lang: str = "hi") -> str:
    """detokenize string for languages of the Indian subcontinent

    A trivial detokenizer which:

        - decides whether punctuation attaches to left/right or both
        - handles number sequences
        - handles quotes smartly (deciding left or right attachment)

    Args:
        text (str): tokenized text to process
        lang (str): language code; accepted for interface compatibility but
            not consulted here, every input is handled by
            ``trivial_detokenize_indic``

    Returns:
        str: detokenized string
    """
    return trivial_detokenize_indic(text)