File size: 4,788 Bytes
103c8f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os
import json
from pathlib import Path
import librosa
import taglib
from tqdm import tqdm
import logging
import soundfile as sf

# Module-level logger named after this module; used by everything below.
logger = logging.getLogger(__name__)
# Configure the root logger so INFO-level progress messages are emitted.
logging.basicConfig(level=logging.INFO)

class MusicDataPreprocessor:
    """Convert a directory tree of MP3/WAV files into a resampled WAV dataset.

    For every supported audio file found under ``input_dir`` the preprocessor
    decodes the audio (mono, ``TARGET_SAMPLE_RATE`` Hz), reads its tags, writes
    a WAV copy to ``<output_dir>/audio`` and finally stores the collected
    metadata plus per-format statistics in
    ``<output_dir>/metadata/dataset_info.json``.
    """

    # Sample rate (Hz) every file is resampled to when it is decoded.
    TARGET_SAMPLE_RATE = 16000
    # Lower-case file extensions (without the dot) that are picked up.
    SUPPORTED_FORMATS = ("mp3", "wav")

    def __init__(self, input_dir: str, output_dir: str):
        """Remember the directories and create the output layout.

        Args:
            input_dir: Directory that is searched recursively for audio files.
            output_dir: Directory that receives the ``audio`` and ``metadata``
                sub-directories; it is created if it does not exist.
        """
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)
        # One dict per successfully processed file, in processing order.
        self.metadata = []

        # Create necessary directories
        self.output_dir.mkdir(parents=True, exist_ok=True)
        (self.output_dir / "audio").mkdir(exist_ok=True)
        (self.output_dir / "metadata").mkdir(exist_ok=True)

    @staticmethod
    def _first_tag(tags, key: str, default: str = "unknown") -> str:
        """Return the first value stored under *key*, or *default*.

        A tag that is missing and a tag that is present but empty are treated
        the same, so an empty value list cannot raise ``IndexError``.
        """
        values = tags.get(key)
        return values[0] if values else default

    @staticmethod
    def _reserve_output_stem(stem: str, taken: set) -> str:
        """Return an output file stem that has not been handed out in *taken*.

        The first file keeps its own stem; later files with the same stem get
        ``_1``, ``_2``, ... appended so they do not overwrite each other.
        Comparison is case-insensitive so the result is also safe on
        case-insensitive file systems. *taken* is updated in place.
        """
        candidate = stem
        suffix = 0
        while candidate.casefold() in taken:
            suffix += 1
            candidate = f"{stem}_{suffix}"
        taken.add(candidate.casefold())
        return candidate

    def _find_audio_files(self) -> list:
        """Return all supported audio files below ``input_dir``, sorted.

        The extension check is case-insensitive (``.MP3`` is accepted) and
        exact, so look-alike extensions and directories are ignored.
        """
        wanted = {f".{ext}" for ext in self.SUPPORTED_FORMATS}
        return sorted(
            path
            for path in self.input_dir.rglob("*")
            if path.suffix.lower() in wanted and path.is_file()
        )

    def _load_audio(self, audio_path: Path):
        """Decode *audio_path* as mono audio at ``TARGET_SAMPLE_RATE``.

        Returns the ``(samples, sample_rate)`` pair produced by
        ``librosa.load``; decoding errors propagate to the caller.
        """
        return librosa.load(audio_path, sr=self.TARGET_SAMPLE_RATE, mono=True)

    def extract_metadata(self, audio_path: Path, audio=None) -> dict:
        """Extract metadata from audio file (MP3 or WAV).

        Args:
            audio_path: Path of the audio file to inspect.
            audio: Optional ``(samples, sample_rate)`` pair that was already
                decoded for this file. When omitted the file is decoded here.

        Returns:
            A dict with the keys ``filename``, ``format``, ``duration``,
            ``genre``, ``title``, ``artist``, ``sample_rate`` and ``channels``,
            or ``None`` if anything failed (the error is logged, not raised).
            ``sample_rate`` is the rate after resampling; ``channels`` is the
            value reported by taglib for the source file.
        """
        try:
            audio_format = audio_path.suffix.lower()[1:]  # extension without dot

            if audio is None:
                audio = self._load_audio(audio_path)
            y, sr = audio
            duration = librosa.get_duration(y=y, sr=sr)

            audio_file = taglib.File(str(audio_path))
            try:
                tags = audio_file.tags
                metadata = {
                    "filename": audio_path.name,
                    "format": audio_format,
                    "duration": duration,
                    "genre": self._first_tag(tags, "GENRE"),
                    "title": self._first_tag(tags, "TITLE"),
                    "artist": self._first_tag(tags, "ARTIST"),
                    "sample_rate": sr,
                    "channels": audio_file.channels,
                }
            finally:
                # Release the underlying file handle even if reading tags fails.
                audio_file.close()

            return metadata

        except Exception as e:
            logger.error("Error processing %s: %s", audio_path, e)
            return None

    def process_files(self):
        """Process all audio files (MP3 and WAV) in the input directory.

        Each file is decoded once, its metadata is extracted, and the audio is
        written as WAV to ``<output_dir>/audio``. Files that fail to decode or
        save are logged and skipped. Afterwards the metadata of all processed
        files and the format statistics are written to
        ``<output_dir>/metadata/dataset_info.json``.
        """
        audio_files = self._find_audio_files()

        # "other" is kept so the JSON schema stays unchanged; with the exact
        # extension filter it is always 0.
        formats_found = {"mp3": 0, "wav": 0, "other": 0}
        formats_processed = {"mp3": 0, "wav": 0}
        taken_stems = set()

        logger.info("Found %d audio files to process", len(audio_files))

        for audio_path in tqdm(audio_files, desc="Processing audio files"):
            # Track format statistics
            file_ext = audio_path.suffix.lower()[1:]
            formats_found[file_ext] += 1

            # Decode once and share the samples between metadata and saving.
            try:
                audio = self._load_audio(audio_path)
            except Exception as e:
                logger.error("Error processing %s: %s", audio_path, e)
                continue

            metadata = self.extract_metadata(audio_path, audio=audio)
            if not metadata:
                continue

            # Save processed audio - convert all to WAV. Stems are made unique
            # so e.g. "a.mp3" and "a.wav" do not overwrite each other.
            output_stem = self._reserve_output_stem(audio_path.stem, taken_stems)
            output_audio_path = self.output_dir / "audio" / f"{output_stem}.wav"
            y, sr = audio
            try:
                sf.write(output_audio_path, y, sr, format='WAV')
            except Exception as e:
                logger.error("Error saving %s: %s", audio_path, e)
                continue

            # Track successful processing
            formats_processed[file_ext] += 1

            # Add path information to metadata
            metadata["processed_path"] = str(
                output_audio_path.relative_to(self.output_dir)
            )
            self.metadata.append(metadata)

        # Save metadata
        info_path = self.output_dir / "metadata" / "dataset_info.json"
        with open(info_path, "w", encoding="utf-8") as f:
            json.dump({
                "files": self.metadata,
                "stats": {
                    "total_processed": len(self.metadata),
                    "formats_found": formats_found,
                    "formats_processed": formats_processed
                }
            }, f, indent=2)

        logger.info("Processed %d files successfully", len(self.metadata))
        logger.info(
            "Files found: MP3: %d, WAV: %d",
            formats_found["mp3"], formats_found["wav"],
        )
        logger.info(
            "Files processed: MP3: %d, WAV: %d",
            formats_processed["mp3"], formats_processed["wav"],
        )

if __name__ == "__main__":
    # Command-line entry point: both directory options are mandatory.
    import argparse

    cli = argparse.ArgumentParser()
    for flag, description in (
        ("--input_dir", "Directory containing music files"),
        ("--output_dir", "Directory to save processed files"),
    ):
        cli.add_argument(flag, type=str, required=True, help=description)
    options = cli.parse_args()

    # Build the preprocessor (creates the output layout) and run the pipeline.
    MusicDataPreprocessor(options.input_dir, options.output_dir).process_files()