File size: 4,706 Bytes

4e4c64c
 
 
d72b2c3
780c8d5
4e4c64c
02bf1ff
a1338da
 
 
 
4e4c64c
 
 
 
 
 
 
 
 
 
 
 
780c8d5
4e4c64c
 
 
 
780c8d5
4e4c64c
 
 
 
 
 
 
 
 
 
02bf1ff
 
a1338da
02bf1ff
4e4c64c
780c8d5
4e4c64c
 
 
 
 
 
780c8d5
 
4e4c64c
 
 
780c8d5
 
 
4e4c64c
 
780c8d5
 
4e4c64c
 
 
 
 
 
 
 
 
780c8d5
 
4e4c64c
5b7599e
780c8d5

# -*- coding: utf-8 -*-
import re
import codecs
import textwrap
from num2words import num2words
# IPA Phonemizer: https://github.com/bootphon/phonemizer
import nltk
#nltk.download('punkt', download_dir='./')
#nltk.download('punkt_tab', download_dir='./')
nltk.data.path.append('.')

_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Full symbol inventory, in order: padding, punctuation, Latin letters, IPA.
symbols = [_pad, *_punctuation, *_letters, *_letters_ipa]

# Symbol -> position in `symbols`. A symbol listed more than once keeps the
# index of its last occurrence, because later entries overwrite earlier ones.
dicts = {symbol: index for index, symbol in enumerate(symbols)}


class TextCleaner:
    """Callable that turns text into a list of symbol indexes."""

    def __init__(self, dummy=None):
        """Attach the module-level symbol table and print its size.

        Args:
            dummy: Accepted but not used.
        """
        self.word_index_dictionary = dicts
        print(len(dicts))

    def __call__(self, text):
        """Return the index of every known character in ``text``.

        Characters missing from the symbol table are skipped; for each
        such character the whole input ``text`` is printed.
        """
        table = self.word_index_dictionary
        encoded = []
        for symbol in text:
            if symbol in table:
                encoded.append(table[symbol])
            else:
                # Unknown symbol: report the offending input and move on.
                print(text)
        return encoded


def split_into_sentences(text, max_len=120):
    """Split ``text`` into sentences and wrap each one to ``max_len`` columns.

    Args:
        text: Input text, tokenized into sentences with ``nltk.sent_tokenize``.
        max_len: Width passed to ``textwrap.wrap`` for every sentence.

    Returns:
        A flat list with the wrapped lines of all sentences, in order.
    """
    chunks = []
    for sentence in nltk.sent_tokenize(text):
        chunks.extend(textwrap.wrap(sentence, width=max_len))
    return chunks


def store_ssml(text=None,
               voice=None):
    """Build an SSML document from sentences and write it to ``_tmp_ssml.txt``.

    Every sentence gets its own ``<prosody volume>``, ``<prosody rate>``,
    ``<voice>`` and ``<s>`` wrapper inside a single ``<speak>`` root. The
    result is printed and written to ``_tmp_ssml.txt`` in the current
    working directory, encoded as ``utf-8-sig``.

    Args:
        text: List of sentences. ``None`` is treated as an empty list, which
            produces an empty ``<speak></speak>`` document.
        voice: Voice name inserted verbatim into ``<voice name='...'>``;
            see https://github.com/MycroftAI/mimic3-voices

    Returns:
        None. The output is the file written to disk.
    """
    # Function-scope imports: neither name is imported at module level, and
    # the original body failed with NameError on `np`.
    from xml.sax.saxutils import escape

    import numpy as np

    if text is None:
        text = []

    # Guard the preview so an empty input does not raise IndexError.
    first_sentence = text[0] if len(text) else ''
    print('\n___________________________\n', len(text),
          first_sentence, '\n___________________________________\n')

    parts = ['<speak>']
    for short_text in text:
        # Speaking rate grows with sentence length, clamped to [0.87, 1.14].
        rate = min(max(.87, len(short_text) / 76), 1.14)
        # Random integer volume in 24..97 (np.random.rand() is in [0, 1)).
        volume = int(74 * np.random.rand() + 24)
        # Original note: the other voice does not have volume.
        parts.append(
            f"<prosody volume='{volume}'>"
            f"<prosody rate='{rate}'>"
            f"<voice name='{voice}'>"
            # Escape &, < and > so arbitrary sentence text stays valid XML.
            f"<s>{escape(short_text)}</s>"
            "</voice>"
            "</prosody>"
            "</prosody>"
        )
    parts.append('</speak>')
    _s = ''.join(parts)
    print(len(text), '\n\n\n\n\n\n\n', _s)

    with codecs.open('_tmp_ssml.txt', 'w', "utf-8-sig") as f:
        f.write(_s)


def transliterate_number(number_string, lang='en'):
    """Spell out every numeral found in ``number_string``.

    Integers, decimals (``3.14``) and scientific notation (``1.5e-3``) are
    converted with ``num2words``; the text around each numeral is kept.
    Decimals are read as the integer part, a spoken separator, then the
    fractional digits one by one.

    Args:
        number_string: Text that may contain numerals.
        lang: Language code. ``'rmc-script_latin'``, ``'ron'``, ``'hun'`` and
            ``'deu'`` select their own phrases; any other value is cut to
            its first two characters and uses the English phrases.

    Returns:
        The text with numerals replaced by words. A numeral whose conversion
        raises ``ValueError`` or ``OverflowError`` is left as it was.
    """
    # lang -> (num2words code, phrase between mantissa and exponent,
    #          spoken decimal separator)
    language_settings = {
        'rmc-script_latin': ('sr', ' puta deset na stepen od ', ' tačka '),
        # Fixed: this entry used the Hungarian exponent phrase.
        'ron': ('ro', ' ori zece la puterea ', ' virgulă '),
        # NOTE(review): both 'hun' phrases look copied from another language;
        # confirm the wording with a Hungarian speaker.
        'hun': ('hu', ' tízszer a erejéig ', ' virgula '),
        # Fixed: normalise to the 2-letter code like the other entries.
        'deu': ('de', ' mal zehn hoch ', ' komma '),
    }
    fallback = (lang[:2], ' times ten to the power of ', ' point ')
    lang, exponential_pronoun, comma = language_settings.get(lang, fallback)

    def replace_number(match):
        """Return the matched span with its numeral written out in words."""
        prefix = match.group(1) or ""
        number_part = match.group(2)
        suffix = match.group(5) or ""

        try:
            lowered = number_part.lower()
            if 'e' in lowered:
                # The pattern allows exactly one exponent marker.
                base, exponent = lowered.split('e')
                words = (num2words(float(base), lang=lang)
                         + exponential_pronoun
                         + num2words(int(exponent), lang=lang))
            elif '.' in number_part:
                integer_part, decimal_part = number_part.split('.')
                spoken_digits = " ".join(
                    num2words(int(digit), lang=lang) for digit in decimal_part)
                words = (num2words(int(integer_part), lang=lang)
                         + comma + spoken_digits)
            else:
                words = num2words(int(number_part), lang=lang)
        except (ValueError, OverflowError):
            # Conversion failed (e.g. numeral too large): keep the original.
            return match.group(0)
        return prefix + words + suffix

    # Groups: 1 = leading non-digits, 2 = numeral, 3 = fraction,
    # 4 = exponent, 5 = trailing non-digits.
    pattern = r'([^\d]*)(\d+(\.\d+)?([Ee][+-]?\d+)?)([^\d]*)'
    return re.sub(pattern, replace_number, number_string)


def discard_leading_numeral(text):
    """Strip a numeral (integer or float) from the start of a string.

    Whitespace before and after the numeral is removed with it.

    Args:
      text: The input string.

    Returns:
      The text that follows the leading numeral, or ``text`` unchanged when
      it does not begin with one.
    """
    leading = re.match(r"^\s*(\d+(\.\d*)?)\s*", text)
    if leading is None:
        # No numeral up front: hand the input back untouched.
        return text
    remainder = text[leading.end():]
    return remainder.lstrip()