Spaces:
Build error
Build error
File size: 7,808 Bytes
86a3a86 5b4d0fb 86a3a86 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 |
import os
import numpy as np
import librosa
import soundfile as sf
import statistics as st
from joblib import load
from pydub import AudioSegment
# Genre labels; predict() indexes this list with the model's outputs, so the
# order must not change.
CLASSES = [
    'blues',
    'classical',
    'country',
    'disco',
    'hiphop',
    'jazz',
    'metal',
    'pop',
    'reggae',
    'rock',
]
class Features:
    """
    Per-segment audio feature extraction built on librosa.

    The input signal is cut into ``N_SEGMENTS`` equal-length segments at
    construction time; :meth:`splits_3sec` then produces one feature vector
    per segment.
    """

    # Number of equal-length segments the input signal is divided into.
    N_SEGMENTS = 10

    def __init__(self, y, sr, hop_length=5000):
        """
        Initialize the class with audio signal, sr and hop_length
        :param y: audio signal
        :param sr: sample rate of audio signal
        :param hop_length: hop_length parameter used while calculating the chroma_stft feature
        """
        y = np.asarray(y)
        # np.split raises ValueError unless the length divides evenly, so the
        # (at most N_SEGMENTS - 1) trailing samples are dropped first. Inputs
        # whose length is already a multiple of N_SEGMENTS are unaffected.
        usable = len(y) - len(y) % self.N_SEGMENTS
        self.y = np.split(y[:usable], self.N_SEGMENTS)
        self.sr = sr
        self.hop_length = hop_length

    def get_mean_var(self, y):
        """
        Helper function to get mean and variance of feature
        :param y: audio feature (anything exposing ``mean()`` and ``var()``)
        :return: mean, variance
        """
        return y.mean(), y.var()

    def zero_crossing_rate(self, y):
        """
        Returns the zero-crossing rate of the audio signal
        :param y: audio signal
        :return: mean and variance of zero-crossing rate
        """
        values = librosa.feature.zero_crossing_rate(y=y)
        return self.get_mean_var(values)

    def harmonic_and_per(self, y):
        """
        Separates the harmonic and percussive components of the audio signal
        :param y: audio signal
        :return: two (mean, variance) tuples: harmonic first, percussive second
        """
        y_harm, y_perc = librosa.effects.hpss(y)
        return self.get_mean_var(y_harm), self.get_mean_var(y_perc)

    def tempo(self, y):
        """
        Estimates the tempo (beats per minute) of an audio signal.
        Parameters:
        y (ndarray): The audio signal represented as an numpy array.
        Returns:
        The value returned by ``librosa.beat.tempo`` unchanged (librosa
        documents it as a one-element ndarray holding the tempo in BPM).
        """
        # y is passed by keyword: librosa >= 0.10 rejects it positionally.
        # NOTE(review): librosa 0.10 deprecates librosa.beat.tempo in favour of
        # librosa.feature.rhythm.tempo - confirm against the pinned version.
        return librosa.beat.tempo(y=y, sr=self.sr)

    def centroid(self, y):
        """
        Extracts the spectral centroid of an audio signal.
        Parameters:
        y (ndarray): The audio signal represented as an numpy array.
        Returns:
        tuple: A tuple containing the mean and variance of the spectral centroid.
        """
        centroid = librosa.feature.spectral_centroid(y=y, sr=self.sr)
        return self.get_mean_var(centroid)

    def mfccs(self, y):
        """
        Extracts the Mel-Frequency Cepstral Coefficients (MFCCs) of an audio signal.
        Parameters:
        y (ndarray): The audio signal represented as an numpy array.
        Returns:
        ndarray: A flat array interleaving each coefficient's mean and
        variance: ``[mean_0, var_0, mean_1, var_1, ...]``.
        """
        mfccs = librosa.feature.mfcc(y=y, sr=self.sr)
        mean = mfccs.mean(axis=1)
        var = mfccs.var(axis=1)
        # Pair up (mean, var) per coefficient, then flatten row by row.
        return np.stack((mean, var), axis=1).reshape(-1)

    def chroma_stft(self, y):
        """
        Extracts the chroma feature of an audio signal.
        Parameters:
        y (ndarray): The audio signal represented as an numpy array.
        Returns:
        tuple: A tuple containing the mean and variance of the chroma feature.
        """
        chroma = librosa.feature.chroma_stft(
            y=y, sr=self.sr, hop_length=self.hop_length
        )
        return self.get_mean_var(chroma)

    def spectral_bandwidth(self, y):
        """
        Extracts the spectral bandwidth of an audio signal.
        Parameters:
        y (ndarray): The audio signal represented as an numpy array.
        Returns:
        tuple: A tuple containing the mean and variance of the spectral bandwidth.
        """
        spd = librosa.feature.spectral_bandwidth(y=y, sr=self.sr)
        return self.get_mean_var(spd)

    def rollof(self, y):
        """
        Extracts the spectral rolloff of an audio signal.
        Parameters:
        y (ndarray): The audio signal represented as an numpy array.
        Returns:
        tuple: A tuple containing the mean and variance of the spectral rolloff.
        """
        rollof = librosa.feature.spectral_rolloff(y=y, sr=self.sr)[0]
        return self.get_mean_var(rollof)

    def rms(self, y):
        """
        Extracts the root mean square (RMS) of an audio signal.
        Parameters:
        y (ndarray): The audio signal represented as an numpy array.
        Returns:
        tuple: A tuple containing the mean and variance of the RMS.
        """
        rms = librosa.feature.rms(y=y)
        return self.get_mean_var(rms)

    def features(self, y):
        """
        Extracts various audio features from an audio signal.
        Parameters:
        y (ndarray): The audio signal represented as an numpy array.
        Returns:
        ndarray: The segment length, the mean/variance pairs of chroma, RMS,
        spectral centroid, spectral bandwidth, rolloff, zero-crossing rate,
        harmonic and percussive components, the tempo, and finally the
        interleaved MFCC means and variances.
        """
        # The tempo estimate comes back as a one-element array; reduce it to a
        # scalar so the list below is homogeneous (recent NumPy refuses to
        # build an array from mixed scalars and sequences).
        tempo = float(np.ravel(self.tempo(y))[0])
        centroid_mean, centroid_var = self.centroid(y)
        chroma_mean, chroma_var = self.chroma_stft(y)
        zcr_mean, zcr_var = self.zero_crossing_rate(y)
        spd_mean, spd_var = self.spectral_bandwidth(y)
        rollof_mean, rollof_var = self.rollof(y)
        rms_mean, rms_var = self.rms(y)
        (harm_mean, harm_var), (perc_mean, perc_var) = self.harmonic_and_per(y)

        # NOTE(review): the column order presumably has to match the layout
        # the model was trained on - do not reorder without checking.
        scalars = np.array(
            [
                y.shape[0],
                chroma_mean, chroma_var,
                rms_mean, rms_var,
                centroid_mean, centroid_var,
                spd_mean, spd_var,
                rollof_mean, rollof_var,
                zcr_mean, zcr_var,
                harm_mean, harm_var,
                perc_mean, perc_var,
                tempo,
            ],
            dtype=np.float32,
        )
        return np.concatenate([scalars, self.mfccs(y)])

    def splits_3sec(self):
        """
        Extracts audio features from each segment created in ``__init__``.

        The segments are 3 seconds long when the signal given to the
        constructor lasts 30 seconds.
        Returns:
        ndarray: An array containing the extracted audio features for each sub-sequence.
        """
        return np.array([self.features(segment) for segment in self.y])
def load_model():
    """
    Load the serialized model stored next to this module.

    The file is read from ``models/model.pkl`` relative to the directory that
    contains this source file.
    :return: the object deserialized by ``joblib.load``
    """
    module_dir = os.path.dirname(__file__)
    return load(os.path.join(module_dir, 'models', "model.pkl"))
def predict(features):
    """
    Classify every feature row and report the most common class.

    The model's outputs are used to index ``CLASSES``, so they are presumably
    integer class ids.
    :param features: feature rows passed straight to ``model.predict``
    :return: tuple of (label of the most frequent prediction, list with the
        label predicted for each row)
    """
    class_ids = load_model().predict(features)
    most_common = st.mode(class_ids)
    labels = [CLASSES[class_id] for class_id in class_ids]
    return CLASSES[most_common], labels
def cuts_silence(audio):
    """
    Trim the signal with ``librosa.effects.trim`` using its default settings.
    :param audio: audio signal
    :return: the trimmed signal (the index interval returned by librosa is
        discarded)
    """
    trimmed = librosa.effects.trim(audio)[0]
    return trimmed
def convert_mp3_to_wav(music_file):
    """
    Decode an MP3 and export it as a WAV file.

    The output always goes to the fixed relative path ``music_file.wav``, so
    every call overwrites the previous result.
    :param music_file: MP3 input handed to ``AudioSegment.from_mp3``
    :return: path of the WAV file that was written
    """
    wav_path = "music_file.wav"
    AudioSegment.from_mp3(music_file).export(wav_path, format="wav")
    return wav_path
def preprosecing(uploaded_file):
    """
    Convert an uploaded MP3, trim it, and classify it.

    The upload is converted to WAV, loaded with librosa, trimmed via
    ``cuts_silence`` and cut to at most ``sr * 30`` samples. The trimmed audio
    is written back over the WAV file, then features are extracted and passed
    to ``predict``.
    :param uploaded_file: MP3 input accepted by ``convert_mp3_to_wav``
    :return: tuple of (binary file handle on the trimmed WAV, result of
        ``predict``); the caller is responsible for closing the handle
    """
    name_file = convert_mp3_to_wav(uploaded_file)
    y, sr = librosa.load(name_file)
    audio_file = cuts_silence(y)[:sr * 30]
    sf.write(file=name_file, data=audio_file, samplerate=sr)

    features = Features(audio_file, sr).splits_3sec()
    prediction = predict(features)

    # Open the handle only after the fallible steps above, so that an error
    # during feature extraction or prediction cannot leak an open file.
    wav_handle = open(name_file, 'rb')
    return wav_handle, prediction