File size: 2,001 Bytes

fe62fb4
bdb4f02
a1338da
 
dafcadc
c7362aa
02bf1ff
 
bdb4f02
dafcadc
02bf1ff
c7362aa
bdb4f02
 
 
dafcadc
c7362aa
02bf1ff
dafcadc
02bf1ff
bdb4f02
 
 
 
dafcadc
c7362aa
02bf1ff
bdb4f02
02bf1ff
c7362aa
02bf1ff
a1338da
bc08da5
 
a1338da
780c8d5
c7362aa
bdb4f02
 
780c8d5

import numpy as np
import soundfile
import msinference  # Prefer live_demo.py instead as this demo.py has no split to sentences to prevent OOM
from audiocraft.builders import AudioGen  # fixed bug for repeated calls

def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
              voice='en_US/m-ailabs_low#mary_ann',  # Listen to voices https://huggingface.co/dkounadis/artificial-styletts2/discussions/1
              soundscape='birds fomig'):            # purposeful spells for AudioGen (behaves as controllable top-p)
    """Synthesize `text` as speech and optionally blend in a generated soundscape.

    Voice routing:
      * `voice` containing 'en_US/' or 'en_UK/' -> style wav is read from
        'assets/wavs/style_vector/'.
      * any other `voice` containing '_' -> style wav is read from
        'assets/wavs/mimic3_foreign_4x/'.
      * otherwise `voice` is handed to `msinference.foreign` as `lang`.

    Args:
        text: Text to speak.
        voice: Voice identifier (or language for the `foreign` path).
        soundscape: Prompt passed to AudioGen; `None` skips the background mix.

    Returns:
        The speech waveform scaled by its peak, or, when `soundscape` is
        given, 0.6 * speech + 0.4 * generated background.
    """
    # Decide which folder (if any) holds the reference wav for this voice.
    if 'en_US/' in voice or 'en_UK/' in voice:
        style_dir = 'assets/wavs/style_vector/'
    elif '_' in voice:
        style_dir = 'assets/wavs/mimic3_foreign_4x/'
    else:
        style_dir = None

    if style_dir is None:
        speech = msinference.foreign(text=text, lang=voice)
    else:
        # Turn the voice id into the wav file stem; substitution order matters.
        wav_stem = voice
        for old, new in (('/', '_'),
                         ('#', '_'),
                         ('cmu-arctic', 'cmu_arctic'),
                         ('_low', '')):
            wav_stem = wav_stem.replace(old, new)
        style_vector = msinference.compute_style(style_dir + wav_stem + '.wav')
        speech = msinference.inference(text, style_vector)

    # Peak-normalize in place, leaving ~2% headroom; epsilon guards a zero peak.
    peak = np.abs(speech).max()
    speech /= 1.02 * peak + 1e-7

    if soundscape is None:
        return speech

    n_samples = len(speech)
    generator = AudioGen().to('cuda:0').eval()
    # Duration is in seconds; the division assumes 16000 samples per second,
    # and the extra .74 s makes the background at least as long as the speech.
    ambience = generator.generate(soundscape,
                                  duration=n_samples / 16000 + .74,
                                  ).detach().cpu().numpy()
    return .6 * speech + .4 * ambience[:n_samples]

# Run the demo with its default arguments and save the result as demo.wav (sample rate 16000).
soundfile.write(file='demo.wav', data=tts_entry(), samplerate=16000)