# ScrapeGoat-Music-Stage1 / prepare_data.py
# Nathan9's picture
# Upload 8 files
# 103c8f5 verified
import os
import json
from pathlib import Path
import librosa
import taglib
from tqdm import tqdm
import logging
import soundfile as sf
# Configure the root logger at import time with INFO as the minimum level.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by the code below.
logger = logging.getLogger(__name__)
class MusicDataPreprocessor:
    """Convert a directory tree of MP3/WAV files into a 16 kHz mono WAV dataset.

    Every supported file found under ``input_dir`` is decoded, resampled,
    written to ``<output_dir>/audio/<name>.wav`` and described by one entry in
    ``<output_dir>/metadata/dataset_info.json``.

    Attributes:
        input_dir: Root directory that is searched recursively for audio.
        output_dir: Root directory that receives ``audio/`` and ``metadata/``.
        metadata: Metadata dicts of the files processed so far.
    """

    # Sample rate (Hz) every file is resampled to.
    TARGET_SAMPLE_RATE = 16000
    # Lower-case extensions (without the dot) that are processed.
    SUPPORTED_FORMATS = ("mp3", "wav")

    def __init__(self, input_dir: str, output_dir: str):
        """Store the directories and create the output layout.

        Args:
            input_dir: Directory containing the source audio files.
            output_dir: Directory to create ``audio/`` and ``metadata/`` in.
        """
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        self.metadata = []

        # Create necessary directories
        self.output_dir.mkdir(parents=True, exist_ok=True)
        (self.output_dir / "audio").mkdir(exist_ok=True)
        (self.output_dir / "metadata").mkdir(exist_ok=True)

    @staticmethod
    def _first_tag(tags, key, default="unknown"):
        """Return the first value stored under *key*, or *default*.

        A tag that is present but holds an empty list is treated like a
        missing tag instead of raising ``IndexError``.
        """
        values = tags.get(key)
        return values[0] if values else default

    @staticmethod
    def _unique_output_name(stem, used_names):
        """Return a ``.wav`` file name for *stem* not yet in *used_names*.

        The first file with a given stem keeps ``<stem>.wav``; later ones get
        ``<stem>_1.wav``, ``<stem>_2.wav``, ...  Names are compared
        case-insensitively so the result is also safe on case-insensitive
        file systems.  The chosen name is recorded in *used_names*.
        """
        candidate = f"{stem}.wav"
        counter = 1
        while candidate.lower() in used_names:
            candidate = f"{stem}_{counter}.wav"
            counter += 1
        used_names.add(candidate.lower())
        return candidate

    def _find_audio_files(self):
        """Return the sorted list of supported audio files under ``input_dir``.

        Extensions are matched case-insensitively against
        ``SUPPORTED_FORMATS``.  Directories are ignored, and so is anything
        inside ``<output_dir>/audio`` so that a re-run with the output nested
        in the input does not ingest its own results.
        """
        processed_root = (self.output_dir / "audio").resolve()
        found = []
        for path in self.input_dir.rglob("*"):
            if path.suffix.lower()[1:] not in self.SUPPORTED_FORMATS:
                continue
            if not path.is_file():
                continue
            if processed_root in path.resolve().parents:
                continue
            found.append(path)
        # Sorted so the order of entries in dataset_info.json is reproducible.
        return sorted(found)

    def _load_audio(self, audio_path):
        """Decode *audio_path* as mono audio at ``TARGET_SAMPLE_RATE``.

        Returns the ``(samples, sample_rate)`` pair from ``librosa.load``.
        """
        return librosa.load(audio_path, sr=self.TARGET_SAMPLE_RATE, mono=True)

    def extract_metadata(self, audio_path: Path, audio=None) -> "dict | None":
        """Extract metadata from audio file (MP3 or WAV).

        Args:
            audio_path: Path of the audio file to describe.
            audio: Optional ``(samples, sample_rate)`` pair that was already
                decoded for this file.  When omitted the file is decoded here.

        Returns:
            A dict with the keys ``filename``, ``format``, ``duration``,
            ``genre``, ``title``, ``artist``, ``sample_rate`` and
            ``channels``, or ``None`` when the file could not be read (the
            error is logged).
        """
        try:
            audio_format = audio_path.suffix.lower()[1:]  # Extension without dot
            if audio is None:
                audio = self._load_audio(audio_path)
            y, sr = audio
            duration = float(librosa.get_duration(y=y, sr=sr))

            audio_file = taglib.File(str(audio_path))
            try:
                tags = audio_file.tags
                return {
                    "filename": audio_path.name,
                    "format": audio_format,
                    "duration": duration,
                    "genre": self._first_tag(tags, "GENRE"),
                    "title": self._first_tag(tags, "TITLE"),
                    "artist": self._first_tag(tags, "ARTIST"),
                    "sample_rate": sr,
                    "channels": audio_file.channels,
                }
            finally:
                # Release the underlying file handle even if a tag lookup fails.
                audio_file.close()
        except Exception as e:
            logger.error("Error processing %s: %s", audio_path, e)
            return None

    def process_files(self):
        """Process all audio files (MP3 and WAV) in the input directory.

        Each file is decoded once, converted to WAV under
        ``<output_dir>/audio`` and appended to ``self.metadata``.  Files that
        fail to decode or to save are logged and skipped.  A summary is
        written to ``<output_dir>/metadata/dataset_info.json``.
        """
        audio_files = self._find_audio_files()

        # "other" is kept for compatibility with the existing JSON layout;
        # unsupported extensions are now filtered out during discovery.
        formats_found = {"mp3": 0, "wav": 0, "other": 0}
        formats_processed = {"mp3": 0, "wav": 0}
        used_names = set()

        logger.info("Found %d audio files to process", len(audio_files))

        for audio_path in tqdm(audio_files, desc="Processing audio files"):
            # Track format statistics
            file_ext = audio_path.suffix.lower()[1:]
            formats_found[file_ext] += 1

            # Decode once and share the samples between metadata extraction
            # and the WAV export.
            try:
                audio = self._load_audio(audio_path)
            except Exception as e:
                logger.error("Error processing %s: %s", audio_path, e)
                continue

            metadata = self.extract_metadata(audio_path, audio=audio)
            if metadata is None:
                continue

            # Save processed audio - convert all to WAV.  Names are made
            # unique so equal stems from different folders or formats do not
            # overwrite each other.
            output_name = self._unique_output_name(audio_path.stem, used_names)
            output_audio_path = self.output_dir / "audio" / output_name
            try:
                y, sr = audio
                sf.write(output_audio_path, y, sr, format='WAV')
            except Exception as e:
                logger.error("Error saving %s: %s", audio_path, e)
                continue

            # Track successful processing
            formats_processed[file_ext] += 1

            # Add path information to metadata
            metadata["processed_path"] = str(
                output_audio_path.relative_to(self.output_dir)
            )
            self.metadata.append(metadata)

        # Save metadata
        info_path = self.output_dir / "metadata" / "dataset_info.json"
        with open(info_path, "w", encoding="utf-8") as f:
            json.dump({
                "files": self.metadata,
                "stats": {
                    "total_processed": len(self.metadata),
                    "formats_found": formats_found,
                    "formats_processed": formats_processed,
                },
            }, f, indent=2)

        logger.info("Processed %d files successfully", len(self.metadata))
        logger.info(
            "Files found: MP3: %d, WAV: %d",
            formats_found["mp3"], formats_found["wav"],
        )
        logger.info(
            "Files processed: MP3: %d, WAV: %d",
            formats_processed["mp3"], formats_processed["wav"],
        )
if __name__ == "__main__":
    # Command-line entry point: both directories are mandatory options.
    import argparse

    cli = argparse.ArgumentParser()
    for flag, help_text in (
        ("--input_dir", "Directory containing music files"),
        ("--output_dir", "Directory to save processed files"),
    ):
        cli.add_argument(flag, type=str, required=True, help=help_text)
    options = cli.parse_args()

    # Build the preprocessor (creates the output folders) and run it.
    MusicDataPreprocessor(options.input_dir, options.output_dir).process_files()