File size: 4,780 Bytes
3914b35 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
from functools import lru_cache
from typing import Any, List, Optional
import numpy
import scipy
from numpy.typing import NDArray
from facefusion.ffmpeg import read_audio_buffer
from facefusion.filesystem import is_audio
from facefusion.types import Audio, AudioFrame, Fps, Mel, MelFilterBank, Spectrogram
from facefusion.voice_extractor import batch_extract_voice
@lru_cache()
def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
	# Memoized wrapper around read_audio(): repeated lookups for the same
	# (audio_path, fps) pair reuse the decoded frame list instead of
	# re-reading the file. NOTE(review): the cache is unbounded by design.
	audio_frames = read_audio(audio_path, fps)
	return audio_frames
def read_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
	# Decode an audio file into a list of mel spectrogram frames, one per
	# video frame at the given fps. Returns None when the path is not audio.
	sample_rate = 48000
	sample_size = 16
	channel_total = 2

	if not is_audio(audio_path):
		return None

	audio_buffer = read_audio_buffer(audio_path, sample_rate, sample_size, channel_total)
	# buffer holds interleaved 16 bit stereo samples -> (samples, 2)
	audio = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, 2)
	audio = prepare_audio(audio)
	spectrogram = create_spectrogram(audio)
	return extract_audio_frames(spectrogram, fps)
@lru_cache()
def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
	# Memoized wrapper around read_voice(): the voice extraction pass is
	# expensive, so cache its result per (audio_path, fps) pair.
	# NOTE(review): the cache is unbounded by design.
	voice_frames = read_voice(audio_path, fps)
	return voice_frames
def read_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
	# Decode an audio file, isolate the voice signal, and slice it into mel
	# spectrogram frames, one per video frame at the given fps.
	# Returns None when the path is not audio.
	sample_rate = 48000
	sample_size = 16
	channel_total = 2
	chunk_size = 240 * 1024
	step_size = 180 * 1024

	if not is_audio(audio_path):
		return None

	audio_buffer = read_audio_buffer(audio_path, sample_rate, sample_size, channel_total)
	# buffer holds interleaved 16 bit stereo samples -> (samples, 2)
	audio = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, 2)
	audio = batch_extract_voice(audio, chunk_size, step_size)
	audio = prepare_voice(audio)
	spectrogram = create_spectrogram(audio)
	return extract_audio_frames(spectrogram, fps)
def get_audio_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
	# Fetch a single cached audio frame, or None when the path is not audio
	# or the frame number is out of range.
	if not is_audio(audio_path):
		return None
	audio_frames = read_static_audio(audio_path, fps)
	if 0 <= frame_number < len(audio_frames):
		return audio_frames[frame_number]
	return None
def extract_audio_frames(spectrogram : Spectrogram, fps : Fps) -> List[AudioFrame]:
	# Slice a spectrogram into fixed-width frames of 16 columns, one per
	# video frame at the given fps. The step of 80 / fps columns per video
	# frame matches the spectrogram rate implied by create_spectrogram()
	# (16000 Hz sample rate / 200-sample hop = 80 columns per second).
	audio_frames = []
	mel_filter_total = 80
	audio_step_size = 16
	# int64 instead of int16: a spectrogram longer than 32767 columns
	# (a few minutes of audio) would overflow int16 and yield negative
	# slice indices, silently corrupting the extracted frames.
	indices = numpy.arange(0, spectrogram.shape[1], mel_filter_total / fps).astype(numpy.int64)
	# drop leading positions that cannot provide a full 16-column window
	indices = indices[indices >= audio_step_size]

	for index in indices:
		start = max(0, index - audio_step_size)
		audio_frames.append(spectrogram[:, start:index])
	return audio_frames
def get_voice_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
	# Fetch a single cached voice frame, or None when the path is not audio
	# or the frame number is out of range.
	if not is_audio(audio_path):
		return None
	voice_frames = read_static_voice(audio_path, fps)
	if 0 <= frame_number < len(voice_frames):
		return voice_frames[frame_number]
	return None
def create_empty_audio_frame() -> AudioFrame:
	# Produce a silent placeholder frame of 80 mel filters x 16 steps,
	# the same shape extract_audio_frames() emits.
	mel_filter_total = 80
	audio_step_size = 16
	return numpy.zeros((mel_filter_total, audio_step_size), dtype = numpy.int16)
def prepare_audio(audio : Audio) -> Audio:
	# Collapse multi-channel audio to mono, peak-normalize to [-1, 1] and
	# apply a pre-emphasis filter (y[n] = x[n] - 0.97 * x[n - 1]).
	if audio.ndim > 1:
		audio = numpy.mean(audio, axis = 1)
	# guard empty input: numpy.max raises on a zero-length array
	if audio.size == 0:
		return audio.astype(numpy.float64)
	peak_amplitude = numpy.max(numpy.abs(audio), axis = 0)
	# guard silent input: dividing by a zero peak would produce NaNs
	if peak_amplitude > 0:
		audio = audio / peak_amplitude
	audio = scipy.signal.lfilter([ 1.0, -0.97 ], [ 1.0 ], audio)
	return audio
def prepare_voice(audio : Audio) -> Audio:
	# Downsample from the 48 kHz capture rate to the 16 kHz rate used by
	# the spectrogram pipeline, then apply the standard audio preparation.
	source_sample_rate = 48000
	target_sample_rate = 16000
	target_sample_total = round(len(audio) * target_sample_rate / source_sample_rate)
	audio = scipy.signal.resample(audio, target_sample_total)
	return prepare_audio(audio)
def convert_hertz_to_mel(hertz : float) -> float:
	# Map a frequency in Hertz onto the mel scale.
	scaled_frequency = 1 + hertz / 700
	return 2595 * numpy.log10(scaled_frequency)
def convert_mel_to_hertz(mel : Mel) -> NDArray[Any]:
	# Map mel-scale values back onto frequencies in Hertz (inverse of
	# convert_hertz_to_mel); accepts scalars or numpy arrays.
	scaled_frequency = 10 ** (mel / 2595) - 1
	return 700 * scaled_frequency
def create_mel_filter_bank() -> MelFilterBank:
	# Build a bank of 80 triangular mel filters over the 401-bin half
	# spectrum that create_spectrogram() produces (800-point FFT).
	audio_sample_rate = 16000
	audio_min_frequency = 55.0
	audio_max_frequency = 7600.0
	mel_filter_total = 80
	mel_bin_total = 800

	mel_filter_bank = numpy.zeros((mel_filter_total, mel_bin_total // 2 + 1))
	# 82 edge points spaced evenly on the mel scale, mapped back to FFT bins
	mel_frequency_range = numpy.linspace(convert_hertz_to_mel(audio_min_frequency), convert_hertz_to_mel(audio_max_frequency), mel_filter_total + 2)
	indices = numpy.floor((mel_bin_total + 1) * convert_mel_to_hertz(mel_frequency_range) / audio_sample_rate).astype(numpy.int16)

	# each filter spans one pair of adjacent edge bins
	for filter_index, start, end in zip(range(mel_filter_total), indices[:-1], indices[1:]):
		mel_filter_bank[filter_index, start:end] = scipy.signal.windows.triang(end - start)
	return mel_filter_bank
def create_spectrogram(audio : Audio) -> Spectrogram:
	# Compute an STFT with 800-sample windows overlapping by 600 samples
	# (200-sample hop) and project its magnitudes onto the mel filter bank.
	mel_bin_total = 800
	mel_bin_overlap = 600

	_, _, complex_spectrogram = scipy.signal.stft(audio, nperseg = mel_bin_total, nfft = mel_bin_total, noverlap = mel_bin_overlap)
	magnitude_spectrogram = numpy.abs(complex_spectrogram)
	return create_mel_filter_bank() @ magnitude_spectrogram
|