# Source: music_recommedation/src/preprosecing.py
# Author: frorozcol — commit 5b4d0fb ("Values")
import os
import numpy as np
import librosa
import soundfile as sf
import statistics as st
from joblib import load
from pydub import AudioSegment
# Genre labels, ordered so the classifier's integer predictions index directly into this list.
CLASSES = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
class Features:
    """Extracts per-segment audio features for the genre classifier.

    The signal given to the constructor is split into 10 sub-sequences
    (3 seconds each for a 30-second clip); ``splits_3sec`` returns one
    feature row per sub-sequence, matching the layout the model was
    trained on.
    """

    def __init__(self, y, sr, hop_length=5000):
        """
        Initialize the class with audio signal, sr and hop_length
        :param y: audio signal
        :param sr: sample rate of audio signal
        :param hop_length: hop_length parameter used while calculating the chroma_stft feature
        """
        # array_split tolerates lengths not divisible by 10; np.split would
        # raise ValueError on clips shorter than the expected 30 seconds.
        self.y = np.array_split(y, 10)
        self.sr = sr
        self.hop_length = hop_length

    def get_mean_var(self, y):
        """
        Helper function to get mean and variance of feature
        :param y: audio feature
        :return: mean, variance
        """
        return y.mean(), y.var()

    def zero_crossing_rate(self, y):
        """
        Returns the zero-crossing rate of the audio signal
        :return: mean and variance of zero-crossing rate
        """
        # librosa >= 0.10 requires the audio to be passed by keyword.
        values = librosa.feature.zero_crossing_rate(y=y)
        return self.get_mean_var(values)

    def harmonic_and_per(self, y):
        """
        separates the harmonic and percussive components of the audio signal
        :return: harmonic and percussive components' mean and variance
        """
        y_harm, y_perc = librosa.effects.hpss(y)
        harm = self.get_mean_var(y_harm)
        perc = self.get_mean_var(y_perc)
        return harm, perc

    def tempo(self, y):
        """
        Extracts the tempo (beats per minute) of an audio signal.
        Parameters:
            y (ndarray): The audio signal represented as an numpy array.
        Returns:
            float: The tempo of the audio signal in beats per minute.
        """
        tempo = librosa.beat.tempo(y=y, sr=self.sr)
        # librosa returns a 1-element ndarray; unwrap it so features() can
        # place a plain float into its float32 vector.
        return float(np.asarray(tempo).flat[0])

    def centroid(self, y):
        """
        Extracts the spectral centroid of an audio signal.
        Parameters:
            y (ndarray): The audio signal represented as an numpy array.
        Returns:
            tuple: A tuple containing the mean and variance of the spectral centroid.
        """
        centroid = librosa.feature.spectral_centroid(y=y, sr=self.sr)
        return self.get_mean_var(centroid)

    def mfccs(self, y):
        """
        Extracts the Mel-Frequency Cepstral Coefficients (MFCCs) of an audio signal.
        Parameters:
            y (ndarray): The audio signal represented as an numpy array.
        Returns:
            ndarray: An array with mean and variance interleaved per coefficient
            ([mean_0, var_0, mean_1, var_1, ...]).
        """
        mfccs = librosa.feature.mfcc(y=y, sr=self.sr)
        mean = mfccs.mean(axis=1)
        var = mfccs.var(axis=1)
        # Interleave mean/var per coefficient to match the training layout.
        return np.stack([mean, var], axis=1).reshape(-1)

    def chroma_stft(self, y):
        """
        Extracts the chroma feature of an audio signal.
        Parameters:
            y (ndarray): The audio signal represented as an numpy array.
        Returns:
            tuple: A tuple containing the mean and variance of the chroma feature.
        """
        chroma = librosa.feature.chroma_stft(y=y, sr=self.sr, hop_length=self.hop_length)
        return self.get_mean_var(chroma)

    def spectral_bandwidth(self, y):
        """
        Extracts the spectral bandwidth of an audio signal.
        Parameters:
            y (ndarray): The audio signal represented as an numpy array.
        Returns:
            tuple: A tuple containing the mean and variance of the spectral bandwidth.
        """
        spd = librosa.feature.spectral_bandwidth(y=y, sr=self.sr)
        return self.get_mean_var(spd)

    def rollof(self, y):
        """
        Extracts the spectral rolloff of an audio signal.
        Parameters:
            y (ndarray): The audio signal represented as an numpy array.
        Returns:
            tuple: A tuple containing the mean and variance of the spectral rolloff.
        """
        rollof = librosa.feature.spectral_rolloff(y=y, sr=self.sr)[0]
        return self.get_mean_var(rollof)

    def rms(self, y):
        """
        Extracts the root mean square (RMS) of an audio signal.
        Parameters:
            y (ndarray): The audio signal represented as an numpy array.
        Returns:
            tuple: A tuple containing the mean and variance of the RMS.
        """
        rms = librosa.feature.rms(y=y)
        return self.get_mean_var(rms)

    def features(self, y):
        """
        Extracts various audio features from an audio signal.
        Parameters:
            y (ndarray): The audio signal represented as an numpy array.
        Returns:
            ndarray: A float32 array containing the extracted audio features
            (segment length, chroma, rms, centroid, bandwidth, rolloff, zcr,
            harmonic/percussive stats, tempo, then interleaved MFCC stats).
        """
        tempo = self.tempo(y)
        centroid_mean, centroid_var = self.centroid(y)
        chroma_mean, chroma_var = self.chroma_stft(y)
        zcr_mean, zcr_var = self.zero_crossing_rate(y)
        spd_mean, spd_var = self.spectral_bandwidth(y)
        rollof_mean, rollof_var = self.rollof(y)
        rsm_mean, rsm_var = self.rms(y)
        harm, perc = self.harmonic_and_per(y)
        harm_mean, harm_var = harm
        perc_mean, perc_var = perc
        mfccs = self.mfccs(y)
        features = np.array([y.shape[0],
                             chroma_mean, chroma_var,
                             rsm_mean, rsm_var,
                             centroid_mean, centroid_var,
                             spd_mean, spd_var,
                             rollof_mean, rollof_var,
                             zcr_mean, zcr_var,
                             harm_mean, harm_var,
                             perc_mean, perc_var,
                             tempo,
                             ],
                            dtype=np.float32)
        return np.concatenate([features, mfccs])

    def splits_3sec(self):
        """
        Splits an audio signal into 3-second sub-sequences and extracts audio features from each sub-sequence.
        Returns:
            ndarray: An array containing the extracted audio features for each 3-second sub-sequence.
        """
        features_split = [self.features(sub_sequence) for sub_sequence in self.y]
        return np.array(features_split)
def load_model():
    """Load the trained genre classifier bundled next to this module.

    Returns:
        The model deserialized from ``models/model.pkl`` (via joblib).
    """
    base_dir = os.path.dirname(__file__)
    model_path = os.path.join(base_dir, "models", "model.pkl")
    return load(model_path)
def predict(features):
    """Classify each feature row and aggregate to a single genre.

    Parameters:
        features: 2-D array with one feature vector per 3-second segment.

    Returns:
        tuple: (most frequent genre name, list of per-segment genre names)
    """
    classifier = load_model()
    labels = classifier.predict(features)
    majority = st.mode(labels)
    names = [CLASSES[label] for label in labels]
    return CLASSES[majority], names
def cuts_silence(audio):
    """Trim leading and trailing silence from the signal.

    Parameters:
        audio (ndarray): The raw audio signal.

    Returns:
        ndarray: The trimmed signal (librosa's default thresholds).
    """
    trimmed, _interval = librosa.effects.trim(audio)
    return trimmed
def convert_mp3_to_wav(music_file):
    """Decode an MP3 and write it as WAV in the current directory.

    Parameters:
        music_file: path or file-like object holding the MP3 data.

    Returns:
        str: The name of the WAV file that was written ("music_file.wav").
    """
    name_file = "music_file.wav"
    AudioSegment.from_mp3(music_file).export(name_file, format="wav")
    return name_file
def preprosecing(uploaded_file):
    """Convert an uploaded MP3 to WAV, trim silence, and predict its genre.

    Parameters:
        uploaded_file: path (or file-like object) of the uploaded MP3.

    Returns:
        tuple: (open binary handle to the processed WAV file,
                (majority genre name, list of per-segment genre names))
    """
    name_file = convert_mp3_to_wav(uploaded_file)
    y, sr = librosa.load(name_file)
    audio_file = cuts_silence(y)
    # Keep at most 30 seconds and force the length to a multiple of 10 so
    # that splitting into 10 sub-sequences in Features cannot fail when the
    # trimmed clip is shorter than 30 s.
    limit = min(len(audio_file), sr * 30)
    audio_file = audio_file[: limit - (limit % 10)]
    sf.write(file=name_file, data=audio_file, samplerate=sr)
    # NOTE(review): the handle is returned to the caller, who is responsible
    # for closing it once the WAV has been consumed.
    file = open(name_file, 'rb')
    features = Features(audio_file, sr).splits_3sec()
    prediction = predict(features)
    return file, prediction