---
license: apache-2.0
tags:
- audio-classification
- deep-speech-detection
- tensorflow
- keras
---

# Model Card for Deep Speech Detection

## Model Description

A TensorFlow/Keras convolutional neural network (CNN) for detecting deepfake or synthetic speech, reporting >95% test accuracy. Input features (MFCCs, chroma, spectral centroid, and other spectral statistics) are extracted with `librosa`.

## Intended Use

- Deepfake speech detection
- Audio authenticity verification

## Dependencies

```bash
pip install tensorflow==2.10.0 librosa==0.10.1 joblib==1.3.2 numpy==1.22.4 pandas==1.5.3 scikit-learn==1.2.2 huggingface_hub
```

## Usage

```python
import os

import joblib
import librosa
import numpy as np
import pandas as pd
import tensorflow as tf
from huggingface_hub import hf_hub_download, snapshot_download

# Download the model and preprocessing files from the Hub
repo_name = "Prince53/deep-speech-detection"
model_dir = "downloaded_model"
scaler_path = hf_hub_download(repo_name, "scaler.pkl", local_dir=model_dir)
label_encoder_path = hf_hub_download(repo_name, "label_encoder.pkl", local_dir=model_dir)
snapshot_download(repo_name, local_dir=model_dir, allow_patterns="saved_model/*")

# Load the model and preprocessing objects
model = tf.keras.models.load_model(os.path.join(model_dir, "saved_model"))
scaler = joblib.load(scaler_path)
label_encoder = joblib.load(label_encoder_path)

# Feature extraction: slide a 2-second window over the audio in 0.25-second steps
# and compute summary statistics of several librosa features per segment.
def segment_and_extract_features(audio, sr=16000):
    segment_samples = int(2.0 * sr)
    step_samples = int(0.25 * sr)
    segments = [audio[i:i + segment_samples]
                for i in range(0, len(audio) - segment_samples + 1, step_samples)]
    features = []
    for segment in segments:
        if len(segment) < segment_samples:
            continue
        mfccs = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=13)
        chroma = librosa.feature.chroma_stft(y=segment, sr=sr)
        spectral_centroid = librosa.feature.spectral_centroid(y=segment, sr=sr)
        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=segment, sr=sr)
        rolloff = librosa.feature.spectral_rolloff(y=segment, sr=sr)
        zero_crossing_rate = librosa.feature.zero_crossing_rate(y=segment)
        features.append({
            'mfcc_mean': np.mean(mfccs, axis=1),
            'mfcc_std': np.std(mfccs, axis=1),
            'chroma': np.mean(chroma, axis=1),
            'spectral_centroid': np.mean(spectral_centroid),
            'spectral_bandwidth': np.mean(spectral_bandwidth),
            'rolloff': np.mean(rolloff),
            'zero_crossing_rate': np.mean(zero_crossing_rate),
        })
    return features

# Classify an audio file
audio, sr = librosa.load("path/to/audio.wav", sr=16000)
segments = segment_and_extract_features(audio, sr)

# Assemble the per-segment feature matrix in the order the scaler was fit on:
# 13 MFCC means, 13 MFCC stds, 12 chroma means, then the 4 scalar features.
segment_features = pd.concat([
    pd.DataFrame([seg['mfcc_mean'] for seg in segments]),
    pd.DataFrame([seg['mfcc_std'] for seg in segments]),
    pd.DataFrame([seg['chroma'] for seg in segments]),
    pd.DataFrame([[seg['spectral_centroid'], seg['spectral_bandwidth'],
                   seg['rolloff'], seg['zero_crossing_rate']] for seg in segments]),
], axis=1)

segment_features = scaler.transform(segment_features.values)
segment_features = segment_features.reshape(segment_features.shape[0], segment_features.shape[1], 1)

# Predict per segment, then aggregate: mean softmax output across segments for the
# confidence scores, majority vote over segment labels for the final classification.
predictions = model.predict(segment_features)
segment_labels = np.argmax(predictions, axis=1)
confidence_scores = np.mean(predictions, axis=0)
final_label = label_encoder.inverse_transform([np.argmax(np.bincount(segment_labels))])[0]

# The printed order assumes the label encoder maps class 0 to "Real" and class 1 to "Fake".
print(f"Confidence Scores: Real={confidence_scores[0]:.4f}, Fake={confidence_scores[1]:.4f}")
print(f"Classification: {final_label} ({0 if final_label == 'Real' else 1})")
```

## Limitations

- Requires mono audio at a 16 kHz sampling rate (see the resampling sketch below).
- May struggle with low-quality audio or unseen domains.
- Trained only on the Comb4 dataset.
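If your recordings are stereo or use a different sampling rate, they can be normalized before classification. This is a minimal sketch, not part of the original card; the file paths are placeholders, and it assumes `librosa` (and its `soundfile` dependency) is installed:

```python
import librosa
import soundfile as sf

# librosa.load downmixes to mono and resamples in one step when sr and mono are given.
audio, sr = librosa.load("path/to/input.wav", sr=16000, mono=True)

# Optionally write the normalized 16 kHz mono file back to disk for reuse.
sf.write("path/to/input_16k_mono.wav", audio, sr)
```

The normalized array can then be passed directly to `segment_and_extract_features` as in the Usage example above.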
## Training Data

- Dataset: Comb4 (custom dataset with real and fake audio)
- Size: [Update with number of samples]

## Evaluation

- Test Accuracy: [Update with >95%]
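To reproduce a test-accuracy number on your own labeled data, evaluation could be scripted roughly as below. This is a sketch, not the original evaluation code: `test_features` and `test_labels` are hypothetical arrays of segment features (already scaled and reshaped as in the Usage example) and matching integer labels, and `model` / `label_encoder` are the objects loaded above.

```python
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

# test_features: (n_segments, n_features, 1) array, preprocessed with the same scaler
# test_labels:   (n_segments,) integer labels consistent with the label encoder
pred = np.argmax(model.predict(test_features), axis=1)

print("Test accuracy:", accuracy_score(test_labels, pred))
print(classification_report(test_labels, pred, target_names=label_encoder.classes_))
```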