File size: 2,001 Bytes
fe62fb4 bdb4f02 a1338da dafcadc c7362aa 02bf1ff bdb4f02 dafcadc 02bf1ff c7362aa bdb4f02 dafcadc c7362aa 02bf1ff dafcadc 02bf1ff bdb4f02 dafcadc c7362aa 02bf1ff bdb4f02 02bf1ff c7362aa 02bf1ff a1338da bc08da5 a1338da 780c8d5 c7362aa bdb4f02 780c8d5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
import numpy as np
import soundfile
import msinference # Prefer live_demo.py instead as this demo.py has no split to sentences to prevent OOM
from audiocraft.builders import AudioGen # fixed bug for repeated calls
def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
              voice='en_US/m-ailabs_low#mary_ann',  # voice samples: https://huggingface.co/dkounadis/artificial-styletts2/discussions/1
              soundscape='birds fomig'):  # misspelled on purpose; per the original author this acts like a controllable top-p for AudioGen
    """Synthesize `text` with the chosen `voice` and optionally blend in a generated soundscape.

    Args:
        text: Text handed to `msinference.inference` or `msinference.foreign`.
        voice: Voice identifier. If it contains 'en_US/' or 'en_UK/', a style
            wav is looked up under 'assets/wavs/style_vector/'. Otherwise, if
            it contains '_', the wav is looked up under
            'assets/wavs/mimic3_foreign_4x/'. Any other value is forwarded as
            `lang` to `msinference.foreign`.
        soundscape: Prompt passed to `AudioGen.generate`; `None` skips the
            background sound entirely.

    Returns:
        The peak-normalised waveform, mixed 60/40 with the generated
        background when `soundscape` is given.
        NOTE(review): presumably a 1-D numpy array sampled at 16 kHz (the
        duration is derived from len(x)/16000) - confirm against msinference.
    """
    # Choose where the reference style wav lives; None means "no style wav".
    if any(tag in voice for tag in ('en_US/', 'en_UK/')):
        style_dir = 'assets/wavs/style_vector/'
    elif '_' in voice:
        style_dir = 'assets/wavs/mimic3_foreign_4x/'
    else:
        style_dir = None

    if style_dir is None:
        x = msinference.foreign(text=text, lang=voice)
    else:
        # Turn the voice id into the wav file stem; the replacement order matters.
        stem = voice.replace('/', '_')
        stem = stem.replace('#', '_')
        stem = stem.replace('cmu-arctic', 'cmu_arctic')
        stem = stem.replace('_low', '')
        style_vector = msinference.compute_style(style_dir + stem + '.wav')
        x = msinference.inference(text, style_vector)

    # Scale in place so the peak magnitude lands just below 1; the epsilon
    # avoids dividing by zero on an all-silent signal.
    peak = np.abs(x).max()
    x /= 1.02 * peak + 1e-7

    if soundscape is None:
        return x

    # Generate slightly more background than speech so the slice below
    # is long enough to cover every speech sample.
    seconds = len(x) / 16000 + .74
    generator = AudioGen().to('cuda:0').eval()
    generated = generator.generate(soundscape, duration=seconds)
    background = generated.detach().cpu().numpy()
    return .6 * x + .4 * background[:len(x)]
# Run the demo with its default arguments and store the result as a 16 kHz wav.
soundfile.write(file='demo.wav', data=tts_entry(), samplerate=16000)
|