File size: 4,706 Bytes
4e4c64c
 
 
d72b2c3
780c8d5
4e4c64c
02bf1ff
a1338da
 
 
 
4e4c64c
 
 
 
 
 
 
 
 
 
 
 
780c8d5
4e4c64c
 
 
 
780c8d5
4e4c64c
 
 
 
 
 
 
 
 
 
02bf1ff
 
a1338da
02bf1ff
4e4c64c
780c8d5
4e4c64c
 
 
 
 
 
780c8d5
 
4e4c64c
 
 
780c8d5
 
 
4e4c64c
 
780c8d5
 
4e4c64c
 
 
 
 
 
 
 
 
780c8d5
 
4e4c64c
5b7599e
780c8d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# -*- coding: utf-8 -*-
import re
import codecs
import textwrap
from num2words import num2words
# IPA Phonemizer: https://github.com/bootphon/phonemizer
import nltk
#nltk.download('punkt', download_dir='./')
#nltk.download('punkt_tab', download_dir='./')
nltk.data.path.append('.')

# Vocabulary pieces: one padding symbol, punctuation, ASCII letters, IPA characters.
_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Export all symbols: padding first, then every character of each group in order.
symbols = [_pad, *_punctuation, *_letters, *_letters_ipa]

# Symbol -> position in `symbols`. A character listed more than once keeps the
# index of its last occurrence, because later entries overwrite earlier ones.
dicts = {symbol: index for index, symbol in enumerate(symbols)}


class TextCleaner:
    """Callable that turns a string into a list of symbol indices."""

    def __init__(self, dummy=None):
        # `dummy` is accepted but never read.
        self.word_index_dictionary = dicts
        # Report the size of the symbol table on construction.
        print(len(dicts))

    def __call__(self, text):
        """Return the index of every character of `text` found in the table.

        Characters missing from the table are skipped; each time one is
        encountered the complete input text is printed.
        """
        lookup = self.word_index_dictionary
        encoded = []
        for symbol in text:
            if symbol in lookup:
                encoded.append(lookup[symbol])
            else:
                print(text)
        return encoded


def split_into_sentences(text, max_len=120):
    """Tokenize `text` into sentences and wrap each to at most `max_len` columns.

    Sentences come from `nltk.sent_tokenize`; every sentence is then passed
    through `textwrap.wrap`, and all resulting pieces are returned as one
    flat list in their original order.
    """
    pieces = []
    for sentence in nltk.sent_tokenize(text):
        pieces.extend(textwrap.wrap(sentence, width=max_len))
    return pieces


def store_ssml(text=None,
               voice=None):
    '''Build an SSML document and write it to ``_tmp_ssml.txt``.

    Every sentence is wrapped in its own volume/rate/voice markup:
        rate   : len(sentence) / 76, clamped to the range [0.87, 1.14]
        volume : random integer drawn as int(74 * rand + 24)

    Args:
        text : list of sentences; ``None`` or an empty list yields an
               empty ``<speak></speak>`` document.
        voice: https://github.com/MycroftAI/mimic3-voices

    The file is written in the current working directory with the
    "utf-8-sig" codec. The document is also printed to stdout.
    '''
    # Function-scope import: this module's import block does not provide `np`,
    # so without it the volume computation below raises NameError.
    import numpy as np

    sentences = [] if text is None else text
    # Guard the preview so an empty input does not raise IndexError.
    preview = sentences[0] if len(sentences) > 0 else ''
    print('\n___________________________\n', len(sentences),
          preview, '\n___________________________________\n')

    parts = ['<speak>']
    for short_text in sentences:

        # 1.44)  # 1.24 for bieber
        rate = min(max(.87, len(short_text) / 76), 1.14)

        volume = int(74 * np.random.rand() + 24)
        # THe other voice does not have volume
        # NOTE(review): sentences are inserted verbatim; confirm whether
        # characters such as '&' or '<' need escaping for the consumer.
        parts.append(
            f'<prosody volume=\'{volume}\'>'
            f'<prosody rate=\'{rate}\'>'
            f'<voice name=\'{voice}\'>'
            '<s>'
            f'{short_text}'
            '</s>'
            '</voice>'
            '</prosody>'
            '</prosody>'
        )
    parts.append('</speak>')
    _s = ''.join(parts)
    print(len(sentences), '\n\n\n\n\n\n\n', _s)

    with codecs.open('_tmp_ssml.txt', 'w', "utf-8-sig") as f:
        f.write(_s)


def transliterate_number(number_string, lang='en'):
    """
    Converts every numeral in a string to words in the specified language,
    handling decimals, scientific notation, and preserving text
    before and after the numeral.

    Integers are spelled with `num2words`. Decimals are spelled as the
    integer part, the language's decimal word, then the fractional digits
    one by one. Scientific notation ("2e5", "1.5E-3") is spelled as the
    mantissa, the language's "times ten to the power of" phrase, then the
    exponent.

    Args:
      number_string: Text that may contain numerals.
      lang: Language code. 'rmc-script_latin', 'ron', 'hun' and 'deu' have
        dedicated phrases; any other code is cut to its first two
        characters and uses the English phrases.

    Returns:
      The text with each numeral replaced by words. A numeral whose
      conversion raises ValueError or OverflowError is left unchanged.
    """
    # language code -> (num2words language, exponent phrase, decimal word)
    # NOTE(review): the Hungarian phrases are kept as found; ' virgula '
    # resembles the Romanian word - confirm with a Hungarian speaker.
    locales = {
        'rmc-script_latin': ('sr', ' puta deset na stepen od ', ' tačka '),
        # Previously this entry reused the Hungarian exponent phrase.
        'ron': ('ro', ' ori zece la puterea ', ' virgulă '),
        'hun': ('hu', ' tízszer a erejéig ', ' virgula '),
        'deu': ('de', ' mal zehn hoch ', ' komma '),
    }
    fallback = (lang[:2], ' times ten to the power of ', ' point ')
    lang, exponential_pronoun, comma = locales.get(lang, fallback)

    def spell_plain(numeral):
        # Spell an unsigned "123" or "123.45" numeral; decimals digit by digit
        # so that leading zeros in the fraction ("3.05") are kept.
        integer_part, _, decimal_part = numeral.partition('.')
        words = num2words(int(integer_part), lang=lang)
        if decimal_part:
            words += comma + " ".join(
                num2words(int(digit), lang=lang) for digit in decimal_part)
        return words

    def replace_number(match):
        prefix = match.group(1) or ""
        number_part = match.group(2)
        suffix = match.group(5) or ""

        try:
            # The mantissa goes through the same speller as plain numerals, so
            # "2e5" is not routed through float() and read as "two point zero".
            mantissa, marker, exponent = number_part.lower().partition('e')
            words = spell_plain(mantissa)
            if marker:
                words += exponential_pronoun + num2words(int(exponent), lang=lang)
            return prefix + words + suffix
        except (ValueError, OverflowError):
            # Return original if conversion fails (e.g. value too large).
            return match.group(0)

    pattern = r'([^\d]*)(\d+(\.\d+)?([Ee][+-]?\d+)?)([^\d]*)'
    return re.sub(pattern, replace_number, number_string)


def discard_leading_numeral(text):
    """Strip a numeral (integer or float) from the start of a string.

    Leading whitespace, the numeral itself, and any whitespace that
    directly follows it are all removed.

    Args:
      text: The input string.

    Returns:
      The remainder of the string after the leading numeral, or the
      original string unchanged when it does not begin with a numeral.
    """
    leading = re.match(r"^\s*(\d+(\.\d*)?)\s*", text)
    if leading is None:
        return text
    remainder = text[leading.end():]
    return remainder.lstrip()