File size: 8,433 Bytes
3215d8d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 |
import os
PWD = os.path.dirname(__file__)
import re
import regex
import json
import traceback
from nemo_text_processing.text_normalization.normalize import Normalizer
from indic_numtowords import num2words, supported_langs
from .translator import GoogleTranslator
# Matches a run of two or more dot-terminated groups of Unicode letters/marks,
# each optionally followed by whitespace (e.g. "अई. अई. टी. ").
# Uses the third-party `regex` package because the pattern relies on \p{L} / \p{M}.
indic_acronym_matcher = regex.compile(r"([\p{L}\p{M}]+\.\s*){2,}")
# short_form_regex = re.compile(r'\b[A-Z\.]{2,}s?\b')
# def get_shortforms_from_string(text):
# return short_form_regex.findall(text)
# Capital letters separated by dots and/or whitespace, e.g. "U.S.A" or "U. S. A".
short_form_regex = re.compile(r"\b([A-Z][\.\s]+)+([A-Z])?\b")
# Whole words built only from the consonants listed below ('y'/'Y' is not in the set).
eng_consonants_regex = re.compile(r"\b[BCDFGHJKLMNPQRSTVWXZbcdfghjklmnpqrstvwxz]+\b")


def get_shortforms_from_string(text):
    """Return every short-form candidate found in ``text``.

    The result lists all matches of ``short_form_regex`` (in text order),
    followed by all matches of ``eng_consonants_regex`` (in text order).
    Matches are returned verbatim, so dotted matches may carry trailing
    whitespace.
    """
    shortforms = []
    for pattern in (short_form_regex, eng_consonants_regex):
        shortforms.extend(found.group() for found in pattern.finditer(text))
    return shortforms
# Numbers that have a fractional part: 1-3 leading digits, then either
# comma-separated groups of 2-3 digits or up to 7 more digits, then ".digits".
# Raw string: "\d" and "\." inside a normal literal are invalid escape sequences.
decimal_str_regex = re.compile(r"\d{1,3}(?:(?:,\d{2,3}){1,3}|(?:\d{1,7}))?(?:\.\d+)")


def get_all_decimals_from_string(text):
    """Return all decimal-number substrings of ``text`` (commas kept), in order.

    Only numbers with a fractional part are returned; plain integers are ignored.
    """
    return decimal_str_regex.findall(text)
# Same shape as the decimal pattern, but the fractional part is optional, so
# this matches both integers and decimals.
# Raw string: "\d" and "\." inside a normal literal are invalid escape sequences.
num_str_regex = re.compile(r"\d{1,3}(?:(?:,\d{2,3}){1,3}|(?:\d{1,7}))?(?:\.\d+)?")


def get_all_numbers_from_string(text):
    """Return all number substrings of ``text`` (commas and fractions kept), in order."""
    return num_str_regex.findall(text)
# Two or more consecutive full stops.
multiple_stops_regex = r'\.\.+'


def replace_multiple_stops(text):
    """Collapse every run of two or more '.' characters in ``text`` into one '.'."""
    collapsed, _replacements = re.subn(multiple_stops_regex, '.', text)
    return collapsed
# Loose pre-filter: digits, separator, digits, separator, digits (each digit run
# may be empty). The lookbehind keeps a match from starting in the middle of a
# digit run without consuming the preceding character, so dates at the very
# start of the text are found and the length check below counts only the date.
date_generic_match_regex = re.compile(r"(?<![0-9])\d*[./-]\d*[./-]\d*")
# Strict pattern: dd/mm/yyyy, dd-mm-yy, yyyy.mm.dd, yy/mm/dd and similar.
date_str_regex = re.compile(r"(?:\d{1,2}[./-]\d{1,2}[./-]\d{2,4})|(?:\d{2,4}[./-]\d{1,2}[./-]\d{1,2})")


def get_all_dates_from_string(text):
    """Return all date-like substrings of ``text``, in order of appearance.

    Candidates are first collected with the loose pattern, anything longer
    than 10 characters (the length of "dd/mm/yyyy") is discarded, and the
    strict date pattern is then applied to the remaining candidates.
    """
    candidates = date_generic_match_regex.findall(text)
    candidates = [c for c in candidates if len(c) <= 10]  # Prune invalid dates
    return date_str_regex.findall(' '.join(candidates))
def get_decimal_substitution(decimal):
    """Spell a decimal string as "<integer part> point <d1> <d2> ...".

    Everything before the first '.' is kept as-is; the characters of each
    later dot-separated piece are separated by single spaces (the pieces
    themselves are concatenated without a separator). The result is stripped
    of surrounding whitespace.
    """
    whole, *fraction_pieces = decimal.split('.')
    # space between every digit after decimal point
    spoken_fraction = ''.join(' '.join(piece) for piece in fraction_pieces)
    return f"{whole} point {spoken_fraction}".strip()
# local-part@domain.tld style addresses.
email_regex = r'[\w.+-]+@[\w-]+\.[\w.-]+'
# Either an (optionally scheme-prefixed) host with at least two dots, or
# anything ending in ".com"; both may carry a short path/query tail.
# The dot before "com" is escaped so ordinary words such as "welcome" are not
# mistaken for URLs.
url_regex = r'((?:\w+://)?\w+\.\w+\.\w+/?[\w\.\?=#]*)|(\w*\.com/?[\w\.\?=#]*)'
# Rupee amounts: the symbol, optional space and sign, digits with optional
# commas, and up to two decimal places.
currency_regex = r"\₹\ ?[+-]?[0-9]{1,3}(?:,?[0-9])*(?:\.[0-9]{1,2})?"
# Optional '+', then 8 to 14 digits that may be separated by spaces or hyphens.
phone_regex = r'\+?\d[ \d-]{6,12}\d'
class TextNormalizer:
    """Rewrite raw text so that numbers, dates, symbols and short forms are spelled out.

    Attributes:
        translator: ``GoogleTranslator`` instance, called as
            ``translator(text=..., from_lang=..., to_lang=...)``.
        normalizer: NeMo ``Normalizer`` (cased English) used to verbalize dates.
        symbols2lang2word: mapping loaded from ``symbols.json``; indexed as
            ``[symbol][lang]`` to get the spoken word for a symbol.
        alphabet2phone: mapping loaded from ``alphabet2phone.json``; indexed by
            a lower-cased character to get its spoken form.
    """

    def __init__(self):
        self.translator = GoogleTranslator()
        self.normalizer = Normalizer(input_case='cased', lang='en')
        # Context managers so the JSON file handles are closed right after loading.
        with open(os.path.join(PWD, "symbols.json"), encoding="utf-8") as symbols_file:
            self.symbols2lang2word = json.load(symbols_file)
        with open(os.path.join(PWD, "alphabet2phone.json"), encoding="utf-8") as phones_file:
            self.alphabet2phone = json.load(phones_file)

    def normalize_text(self, text, lang):
        """Run the full normalization pipeline on ``text`` for language ``lang``.

        Sentence terminators ("।", "|", "꯫") are first mapped to "." and the
        text is stripped; then short forms, decimals, punctuation, dates,
        symbols and numbers are processed, in that order.
        """
        # NOTE(review): normalize_decimals runs before convert_dates_to_words, so a
        # dot-separated date such as "12.05.2020" appears to be rewritten as a
        # decimal ("12 point 0 5.2020") before date detection - confirm intended.
        text = text.replace("।", ".").replace("|", ".").replace("꯫", ".").strip()
        text = self.expand_shortforms(text, lang)
        text = self.normalize_decimals(text, lang)
        text = self.replace_punctutations(text, lang)
        text = self.convert_dates_to_words(text, lang)
        text = self.convert_symbols_to_words(text, lang)
        text = self.convert_numbers_to_words(text, lang)
        return text

    def normalize_decimals(self, text, lang):
        """Replace every decimal number in ``text`` with "<int> point <d> <d> ...".

        ``lang`` is accepted for signature parity with the other steps but is not used.
        """
        decimal_strs = get_all_decimals_from_string(text)
        if not decimal_strs:
            return text
        for decimal_str in decimal_strs:
            # Thousands separators are dropped before spelling the number out.
            decimal_sub = get_decimal_substitution(decimal_str.replace(',', ''))
            text = text.replace(decimal_str, decimal_sub)
        return text

    def replace_punctutations(self, text, lang):
        """Normalize sentence punctuation in ``text``.

        Runs of full stops are collapsed. For every language except 'brx' and
        'or', "।" becomes "." and " ." is appended when the text does not
        already end in punctuation; for 'brx' and 'or', "." becomes "।".
        Brackets and ";" are turned into ",". Empty text is returned unchanged.
        """
        if not text:
            # Nothing to punctuate; also avoids indexing the last character of "".
            return text
        text = replace_multiple_stops(text)
        if lang not in ['brx', 'or']:
            text = text.replace('।', '.')
            if not text.endswith(('.', '!', '?', ',', ':', ';')):
                text = text + ' .'
        else:
            text = text.replace('.', '।')
        text = text.replace('|', '.')
        for bracket in ['(', ')', '{', '}', '[', ']']:
            text = text.replace(bracket, ',')
        # text = text.replace(':', ',').replace(';',',')
        text = text.replace(';', ',')
        return text

    def convert_numbers_to_words(self, text, lang):
        """Replace each number in ``text`` with its spoken form in ``lang``.

        Languages in ``supported_langs`` are verbalized directly. Other
        languages are verbalized in English and then translated; if the
        translation fails the English words are used, and if even the English
        verbalization fails the text is returned unchanged.

        Raises:
            ValueError: if ``text`` still contains a decimal number (the
                number pattern also matches decimals, which ``int`` rejects);
                run ``normalize_decimals`` first.
        """
        num_strs = get_all_numbers_from_string(text)
        if not num_strs:
            return text

        # TODO: If it is a large integer without commas (say >5 digits), spell it out numeral by numeral
        # NOTE: partially handled by phones
        numbers = [int(num_str.replace(',', '')) for num_str in num_strs]

        if lang in supported_langs:
            num_words = [num2words(num, lang=lang) for num in numbers]
        else:  # Fallback, converting to Indian-English, followed by NMT
            try:
                num_words = [num2words(num, lang="en") for num in numbers]
            except Exception:
                # Without any words there is nothing to substitute.
                traceback.print_exc()
                return text
            try:
                # TODO: Cache the results?
                num_words = [
                    self.translator(text=num_word, from_lang="en", to_lang=lang)
                    for num_word in num_words
                ]
            except Exception:
                # Best effort: keep the English words when translation fails.
                traceback.print_exc()

        for num_str, num_word in zip(num_strs, num_words):
            text = text.replace(num_str, ' ' + num_word + ' ', 1)
        # The padding above can leave doubled spaces; collapse them.
        return text.replace("  ", ' ')

    def convert_dates_to_words(self, text, lang):
        """Replace each date in ``text`` with its verbalized form.

        Dates are verbalized in English by the NeMo normalizer and then
        translated to ``lang``, except for 'brx' and 'en' where the English
        form is used directly.
        """
        date_strs = get_all_dates_from_string(text)
        if not date_strs:
            return text
        for date_str in date_strs:
            normalized_str = self.normalizer.normalize(date_str, verbose=False, punct_post_process=True)
            if lang in ['brx', 'en']:  # no translate
                translated_str = normalized_str
            else:
                translated_str = self.translator(text=normalized_str, from_lang="en", to_lang=lang)
            text = text.replace(date_str, translated_str)
        return text

    def expand_phones(self, item):
        """Return ``item`` with a single space between every character."""
        return ' '.join(item)

    def find_valid(self, regex_str, text):
        """Return the non-empty matches of ``regex_str`` in ``text``.

        When the pattern has several groups, ``re.findall`` yields tuples; the
        first non-empty group of each tuple is kept and all-empty tuples are
        dropped.
        """
        return_items = []
        for item in re.findall(regex_str, text):
            if isinstance(item, tuple):
                # choose first valid sub item
                item = next((subitem for subitem in item if subitem), '')
            if item:
                return_items.append(item)
        return return_items

    def _spell_out_symbols(self, item, lang):
        """Replace every known symbol in ``item`` with its space-padded word in ``lang``."""
        for symbol in self.symbols2lang2word.keys():
            item = item.replace(symbol, f' {self.symbols2lang2word[symbol][lang]} ')
        return item

    def convert_symbols_to_words(self, text, lang):
        """Spell out symbols inside e-mails, URLs, currency amounts and phone numbers.

        Currency amounts have the rupee sign moved behind the numerals, phone
        numbers are additionally spaced out character by character, and every
        '%' in the text is replaced by its word in ``lang``.

        Raises:
            KeyError: if a symbol has no entry for ``lang`` in ``symbols2lang2word``.
        """
        emails = self.find_valid(email_regex, text)
        # urls = re.findall(r'(?:\w+://)?\w+\.\w+\.\w+/?[\w\.\?=#]*', text)
        urls = self.find_valid(url_regex, text)
        for item in emails + urls:
            text = text.replace(item, self._spell_out_symbols(item, lang))

        for item in self.find_valid(currency_regex, text):
            item_norm = item.replace('₹', '') + '₹'  # Pronounce after numerals
            text = text.replace(item, self._spell_out_symbols(item_norm, lang))

        for item in self.find_valid(phone_regex, text):
            item_norm = self._spell_out_symbols(item.replace('-', ' '), lang)
            text = text.replace(item, self.expand_phones(item_norm))

        # percentage
        text = text.replace('%', self.symbols2lang2word['%'][lang])
        return text

    def convert_char2phone(self, char):
        """Return the spoken form of ``char`` (case-insensitive), or '' if unknown."""
        return self.alphabet2phone.get(char.lower(), '')

    def expand_shortforms(self, text, lang):
        """Expand short forms so that they are read letter by letter.

        For non-English text, dots inside dotted acronyms are replaced by
        spaces. For English, each short form found is replaced by the spoken
        forms of its characters, joined by spaces; the words "I" and "A" are
        left alone.
        """
        if lang != 'en':
            # Remove dots, as it speaks out like each letter is separate sentence
            # Example: अई. अई. टी. -> अई अई टी
            for match in regex.finditer(indic_acronym_matcher, text):
                match = match.group()
                match_without_dot = match.replace('.', ' ')
                text = text.replace(match, match_without_dot)
            return text

        for shortform in get_shortforms_from_string(text):
            shortform = shortform.strip()
            if not shortform or shortform in ('I', 'A'):
                # Skip if valid English words
                continue
            expanded = ' '.join(self.convert_char2phone(char) for char in shortform)
            # Anchor on word boundaries so that the short form is not replaced
            # inside a longer word (e.g. the "S" of "Some"). The trailing
            # boundary only applies when the short form ends in a word character.
            pattern = r"\b" + re.escape(shortform)
            if shortform[-1].isalnum() or shortform[-1] == '_':
                pattern += r"\b"
            # A callable replacement keeps backslashes in `expanded` literal.
            text = re.sub(pattern, lambda _match: expanded, text, count=1)
        return text
|