Spaces:

divvun-tts
/

multi-sami

Sleeping

katrihiovain

initial commit

8146713 6 months ago

2.67 kB

	import re
	from . import cmudict

	_letter_to_arpabet = {
	'A': 'EY1',
	'B': 'B IY1',
	'C': 'S IY1',
	'D': 'D IY1',
	'E': 'IY1',
	'F': 'EH1 F',
	'G': 'JH IY1',
	'H': 'EY1 CH',
	'I': 'AY1',
	'J': 'JH EY1',
	'K': 'K EY1',
	'L': 'EH1 L',
	'M': 'EH1 M',
	'N': 'EH1 N',
	'O': 'OW1',
	'P': 'P IY1',
	'Q': 'K Y UW1',
	'R': 'AA1 R',
	'S': 'EH1 S',
	'T': 'T IY1',
	'U': 'Y UW1',
	'V': 'V IY1',
	'X': 'EH1 K S',
	'Y': 'W AY1',
	'W': 'D AH1 B AH0 L Y UW0',
	'Z': 'Z IY1',
	's': 'Z'
	}

	# Acronyms that should not be expanded
	hardcoded_acronyms = [
	'BMW', 'MVD', 'WDSU', 'GOP', 'UK', 'AI', 'GPS', 'BP', 'FBI', 'HD',
	'CES', 'LRA', 'PC', 'NBA', 'BBL', 'OS', 'IRS', 'SAC', 'UV', 'CEO', 'TV',
	'CNN', 'MSS', 'GSA', 'USSR', 'DNA', 'PRS', 'TSA', 'US', 'GPU', 'USA',
	'FPCC', 'CIA']

	# Words and acronyms that should be read as regular words, e.g., NATO, HAPPY, etc.
	uppercase_whiteliset = []

	acronyms_exceptions = {
	'NVIDIA': 'N.VIDIA',
	}

	non_uppercase_exceptions = {
	'email': 'e-mail',
	}

	# must ignore roman numerals
	_acronym_re = re.compile(r'([a-z]*[A-Z][A-Z]+)s?\.?')
	_non_uppercase_re = re.compile(r'\b({})\b'.format('\|'.join(non_uppercase_exceptions.keys())), re.IGNORECASE)


	def _expand_acronyms_to_arpa(m, add_spaces=True):
	acronym = m.group(0)

	# remove dots if they exist
	acronym = re.sub('\.', '', acronym)

	acronym = "".join(acronym.split())
	arpabet = cmudict.lookup(acronym)

	if arpabet is None:
	acronym = list(acronym)
	arpabet = ["{" + _letter_to_arpabet[letter] + "}" for letter in acronym]
	# temporary fix
	if arpabet[-1] == '{Z}' and len(arpabet) > 1:
	arpabet[-2] = arpabet[-2][:-1] + ' ' + arpabet[-1][1:]
	del arpabet[-1]

	arpabet = ' '.join(arpabet)
	elif len(arpabet) == 1:
	arpabet = "{" + arpabet[0] + "}"
	else:
	arpabet = acronym

	return arpabet


	def normalize_acronyms(text):
	text = re.sub(_acronym_re, _expand_acronyms_to_arpa, text)
	return text


	def expand_acronyms(m):
	text = m.group(1)
	if text in acronyms_exceptions:
	text = acronyms_exceptions[text]
	elif text in uppercase_whiteliset:
	text = text
	else:
	text = '.'.join(text) + '.'

	if 's' in m.group(0):
	text = text + '\'s'

	if text[-1] != '.' and m.group(0)[-1] == '.':
	return text + '.'
	else:
	return text


	def spell_acronyms(text):
	text = re.sub(_non_uppercase_re, lambda m: non_uppercase_exceptions[m.group(0).lower()], text)
	text = re.sub(_acronym_re, expand_acronyms, text)
	return text