File size: 4,706 Bytes
4e4c64c d72b2c3 780c8d5 4e4c64c 02bf1ff a1338da 4e4c64c 780c8d5 4e4c64c 780c8d5 4e4c64c 02bf1ff a1338da 02bf1ff 4e4c64c 780c8d5 4e4c64c 780c8d5 4e4c64c 780c8d5 4e4c64c 780c8d5 4e4c64c 780c8d5 4e4c64c 5b7599e 780c8d5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
# -*- coding: utf-8 -*-
import re
import codecs
import textwrap
from num2words import num2words
# IPA Phonemizer: https://github.com/bootphon/phonemizer
import nltk
#nltk.download('punkt', download_dir='./')
#nltk.download('punkt_tab', download_dir='./')
nltk.data.path.append('.')
# Padding symbol; it is listed first, so it receives index 0.
_pad = "$"
# Punctuation marks (including the space character).
_punctuation = ';:,.!?¡¿—…"«»“” '
# Plain ASCII letters, upper case first.
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
# IPA letters, stress/length marks and intonation arrows.
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Export all symbols: pad, then punctuation, then letters, then IPA.
symbols = [_pad, *_punctuation, *_letters, *_letters_ipa]

# Symbol -> position in `symbols`. A character that is listed more than once
# keeps the index of its last occurrence.
dicts = {symbol: index for index, symbol in enumerate(symbols)}
class TextCleaner:
    """Callable that turns a string into a list of symbol indices.

    The lookup table is the module-level ``dicts`` mapping.
    """

    def __init__(self, dummy=None):
        # `dummy` is accepted but never read.
        self.word_index_dictionary = dicts
        # Report the size of the symbol table on construction.
        print(len(dicts))

    def __call__(self, text):
        """Return the index of every known character of ``text``, in order.

        A character that is missing from the table is skipped; the whole
        input text is printed once for each such character.
        """
        table = self.word_index_dictionary
        encoded = []
        for symbol in text:
            if symbol in table:
                encoded.append(table[symbol])
            else:
                print(text)
        return encoded
def split_into_sentences(text, max_len=120):
    """Split ``text`` into sentence pieces of at most ``max_len`` characters.

    The text is first cut into sentences with ``nltk.sent_tokenize``; every
    sentence is then passed through ``textwrap.wrap`` with ``width=max_len``
    and the resulting pieces are returned as one flat list, in order.
    """
    pieces = []
    for sentence in nltk.sent_tokenize(text):
        pieces.extend(textwrap.wrap(sentence, width=max_len))
    return pieces
def store_ssml(text=None,
               voice=None):
    '''Create an SSML document and write it to ``_tmp_ssml.txt``.

    Every sentence is wrapped in nested ``<prosody volume>``,
    ``<prosody rate>``, ``<voice>`` and ``<s>`` elements inside a single
    ``<speak>`` root. The rate is ``len(sentence) / 76`` clamped to
    ``[0.87, 1.14]``; the volume is a random integer in ``[24, 97]`` drawn
    anew for every sentence.

    text : list of sentences (``None`` is treated as an empty list)
    voice: https://github.com/MycroftAI/mimic3-voices

    Returns ``None``. The document is printed and written to
    ``_tmp_ssml.txt`` in the current working directory using the
    ``utf-8-sig`` codec.
    '''
    # The module never imports numpy, so the bare ``np`` used for the random
    # volume raised NameError; import it where it is needed.
    import numpy as np

    sentences = [] if text is None else text

    # Debug banner: number of sentences and the first one (if there is one;
    # indexing an empty list used to raise IndexError here).
    first_sentence = sentences[0] if len(sentences) else ''
    print('\n___________________________\n', len(sentences),
          first_sentence, '\n___________________________________\n')

    parts = ['<speak>']
    for short_text in sentences:
        # 1.44) # 1.24 for bieber
        rate = min(max(.87, len(short_text) / 76), 1.14)
        volume = int(74 * np.random.rand() + 24)
        # The other voice does not have volume
        parts.append(
            f"<prosody volume='{volume}'>"
            f"<prosody rate='{rate}'>"
            f"<voice name='{voice}'>"
            f"<s>{short_text}</s>"
            "</voice>"
            "</prosody>"
            "</prosody>"
        )
    parts.append('</speak>')
    _s = ''.join(parts)

    print(len(sentences), '\n\n\n\n\n\n\n', _s)
    with codecs.open('_tmp_ssml.txt', 'w', "utf-8-sig") as f:
        f.write(_s)
def transliterate_number(number_string, lang='en'):
    """
    Converts every numeral in a string to words in the specified language,
    handling decimals and scientific notation, and preserving the text
    before and after each numeral.

    Args:
        number_string: Text that may contain numerals such as ``12``,
            ``3.14`` or ``2e-3``.
        lang: Language code. ``'rmc-script_latin'``, ``'ron'``, ``'hun'``
            and ``'deu'`` are mapped to the num2words codes ``'sr'``,
            ``'ro'``, ``'hu'`` and ``'de'`` with their own connective
            words; any other code is cut to its first two characters and
            uses the English connective words.

    Returns:
        The string with each numeral replaced by its spelled-out form. The
        integer part is read as one number, decimals are read digit by
        digit, and an exponent is appended after the language's
        "times ten to the power of" phrase. A numeral that cannot be
        converted is left unchanged.
    """
    # lang -> (num2words code, mantissa/exponent connective, decimal-mark word)
    known_languages = {
        'rmc-script_latin': ('sr', ' puta deset na stepen od ', ' tačka '),
        # The Romanian connective used to be a copy of the Hungarian one.
        'ron': ('ro', ' ori zece la puterea ', ' virgulă '),
        'hun': ('hu', ' tízszer a erejéig ', ' virgula '),
        'deu': ('de', ' mal zehn hoch ', ' komma '),
    }
    if lang in known_languages:
        lang, exponential_pronoun, comma = known_languages[lang]
    else:
        lang = lang[:2]
        exponential_pronoun = ' times ten to the power of '
        comma = ' point '

    def spell(numeral):
        # Spell "123" or "123.45"; kept as text/int throughout so that long
        # digit runs are not rounded by a float conversion.
        integer_part, dot, decimal_part = numeral.partition('.')
        words = num2words(int(integer_part), lang=lang)
        if dot:
            words += comma + " ".join(
                num2words(int(digit), lang=lang) for digit in decimal_part)
        return words

    def replace_number(match):
        number_part = match.group(0)
        try:
            mantissa, marker, exponent = number_part.lower().partition('e')
            words = spell(mantissa)
            if marker:
                words += exponential_pronoun + num2words(int(exponent), lang=lang)
            return words
        except (ValueError, OverflowError, NotImplementedError):
            # Return original if conversion fails (bad value, number too
            # large for num2words, or language not supported by it).
            return number_part

    # Only the numeral itself is matched: re.sub already leaves the
    # surrounding text untouched, and leading/trailing "non-digit" groups
    # would make the scan quadratic on text without digits.
    pattern = r'\d+(?:\.\d+)?(?:[Ee][+-]?\d+)?'
    return re.sub(pattern, replace_number, number_string)
def discard_leading_numeral(text):
    """Strip a numeral (integer or float) from the start of a string.

    Leading whitespace, the numeral itself (digits, optionally followed by
    a dot and further digits) and the whitespace after it are removed.

    Args:
        text: The input string.

    Returns:
        The remainder of the string after the leading numeral, or the
        original string unchanged when it does not begin with a numeral.
    """
    leading = re.match(r"^\s*(\d+(\.\d*)?)\s*", text)
    if leading is None:
        return text
    remainder = text[leading.end():]
    return remainder.lstrip()
|