6L-TTS / common /text /acronyms.py
kathiasi's picture
Upload 100 files
16f0ad7 verified
import re
from . import cmudict
_letter_to_arpabet = {
'A': 'EY1',
'B': 'B IY1',
'C': 'S IY1',
'D': 'D IY1',
'E': 'IY1',
'F': 'EH1 F',
'G': 'JH IY1',
'H': 'EY1 CH',
'I': 'AY1',
'J': 'JH EY1',
'K': 'K EY1',
'L': 'EH1 L',
'M': 'EH1 M',
'N': 'EH1 N',
'O': 'OW1',
'P': 'P IY1',
'Q': 'K Y UW1',
'R': 'AA1 R',
'S': 'EH1 S',
'T': 'T IY1',
'U': 'Y UW1',
'V': 'V IY1',
'X': 'EH1 K S',
'Y': 'W AY1',
'W': 'D AH1 B AH0 L Y UW0',
'Z': 'Z IY1',
's': 'Z'
}
# Acronyms that should not be expanded
hardcoded_acronyms = [
'BMW', 'MVD', 'WDSU', 'GOP', 'UK', 'AI', 'GPS', 'BP', 'FBI', 'HD',
'CES', 'LRA', 'PC', 'NBA', 'BBL', 'OS', 'IRS', 'SAC', 'UV', 'CEO', 'TV',
'CNN', 'MSS', 'GSA', 'USSR', 'DNA', 'PRS', 'TSA', 'US', 'GPU', 'USA',
'FPCC', 'CIA']
# Words and acronyms that should be read as regular words, e.g., NATO, HAPPY, etc.
uppercase_whiteliset = []
acronyms_exceptions = {
'NVIDIA': 'N.VIDIA',
}
non_uppercase_exceptions = {
'email': 'e-mail',
}
# must ignore roman numerals
_acronym_re = re.compile(r'([a-z]*[A-Z][A-Z]+)s?\.?')
_non_uppercase_re = re.compile(r'\b({})\b'.format('|'.join(non_uppercase_exceptions.keys())), re.IGNORECASE)
def _expand_acronyms_to_arpa(m, add_spaces=True):
acronym = m.group(0)
# remove dots if they exist
acronym = re.sub('\.', '', acronym)
acronym = "".join(acronym.split())
arpabet = cmudict.lookup(acronym)
if arpabet is None:
acronym = list(acronym)
arpabet = ["{" + _letter_to_arpabet[letter] + "}" for letter in acronym]
# temporary fix
if arpabet[-1] == '{Z}' and len(arpabet) > 1:
arpabet[-2] = arpabet[-2][:-1] + ' ' + arpabet[-1][1:]
del arpabet[-1]
arpabet = ' '.join(arpabet)
elif len(arpabet) == 1:
arpabet = "{" + arpabet[0] + "}"
else:
arpabet = acronym
return arpabet
def normalize_acronyms(text):
text = re.sub(_acronym_re, _expand_acronyms_to_arpa, text)
return text
def expand_acronyms(m):
text = m.group(1)
if text in acronyms_exceptions:
text = acronyms_exceptions[text]
elif text in uppercase_whiteliset:
text = text
else:
text = '.'.join(text) + '.'
if 's' in m.group(0):
text = text + '\'s'
if text[-1] != '.' and m.group(0)[-1] == '.':
return text + '.'
else:
return text
def spell_acronyms(text):
text = re.sub(_non_uppercase_re, lambda m: non_uppercase_exceptions[m.group(0).lower()], text)
text = re.sub(_acronym_re, expand_acronyms, text)
return text