File size: 2,667 Bytes
8146713
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import re
from . import cmudict

_letter_to_arpabet = {
    'A': 'EY1',
    'B': 'B IY1',
    'C': 'S IY1',
    'D': 'D IY1',
    'E': 'IY1',
    'F': 'EH1 F',
    'G': 'JH IY1',
    'H': 'EY1 CH',
    'I': 'AY1',
    'J': 'JH EY1',
    'K': 'K EY1',
    'L': 'EH1 L',
    'M': 'EH1 M',
    'N': 'EH1 N',
    'O': 'OW1',
    'P': 'P IY1',
    'Q': 'K Y UW1',
    'R': 'AA1 R',
    'S': 'EH1 S',
    'T': 'T IY1',
    'U': 'Y UW1',
    'V': 'V IY1',
    'X': 'EH1 K S',
    'Y': 'W AY1',
    'W': 'D AH1 B AH0 L Y UW0',
    'Z': 'Z IY1',
    's': 'Z'
}

# Acronyms that should not be expanded
hardcoded_acronyms = [
    'BMW', 'MVD', 'WDSU', 'GOP', 'UK', 'AI', 'GPS', 'BP', 'FBI', 'HD',
    'CES', 'LRA', 'PC', 'NBA', 'BBL', 'OS', 'IRS', 'SAC', 'UV', 'CEO', 'TV',
    'CNN', 'MSS', 'GSA', 'USSR', 'DNA', 'PRS', 'TSA', 'US', 'GPU', 'USA',
    'FPCC', 'CIA']

# Words and acronyms that should be read as regular words, e.g., NATO, HAPPY, etc.
uppercase_whiteliset = []

acronyms_exceptions = {
    'NVIDIA': 'N.VIDIA',
}

non_uppercase_exceptions = {
    'email': 'e-mail',
}

# must ignore roman numerals
_acronym_re = re.compile(r'([a-z]*[A-Z][A-Z]+)s?\.?')
_non_uppercase_re = re.compile(r'\b({})\b'.format('|'.join(non_uppercase_exceptions.keys())), re.IGNORECASE)


def _expand_acronyms_to_arpa(m, add_spaces=True):
    acronym = m.group(0)

    # remove dots if they exist
    acronym = re.sub('\.', '', acronym)

    acronym = "".join(acronym.split())
    arpabet = cmudict.lookup(acronym)

    if arpabet is None:
        acronym = list(acronym)
        arpabet = ["{" + _letter_to_arpabet[letter] + "}" for letter in acronym]
        # temporary fix
        if arpabet[-1] == '{Z}' and len(arpabet) > 1:
            arpabet[-2] = arpabet[-2][:-1] + ' ' + arpabet[-1][1:]
            del arpabet[-1]

        arpabet = ' '.join(arpabet)
    elif len(arpabet) == 1:
        arpabet = "{" + arpabet[0] + "}"
    else:
        arpabet = acronym

    return arpabet


def normalize_acronyms(text):
    text = re.sub(_acronym_re, _expand_acronyms_to_arpa, text)
    return text


def expand_acronyms(m):
    text = m.group(1)
    if text in acronyms_exceptions:
        text = acronyms_exceptions[text]
    elif text in uppercase_whiteliset:
        text = text
    else:
        text = '.'.join(text) + '.'

    if 's' in m.group(0):
        text = text + '\'s'

    if text[-1] != '.' and m.group(0)[-1] == '.':
        return text + '.'
    else:
        return text


def spell_acronyms(text):
    text = re.sub(_non_uppercase_re, lambda m: non_uppercase_exceptions[m.group(0).lower()], text)
    text = re.sub(_acronym_re, expand_acronyms, text)
    return text