import multiprocessing
import os
import tarfile
import time
from functools import partial

import wget

# Directory that every downloaded archive is extracted into.
save_dir = "/workspace/seungheon/dataset"
# Create it up front; exist_ok=True makes this a no-op when it already exists.
os.makedirs(save_dir, exist_ok=True)

# Number of archive files to fetch for each dataset name. Archives are named
# "<db_name><index>.tar.gz" with index running from 0 to count - 1.
db_config = {"fma": 34, "mtg_jamendo": 134, "medleydb": 100, "moisesdb": 8, "musicnet": 21}

# Flat list of every archive URL, grouped by dataset in db_config order.
urls = [
    f"https://huggingface.co/datasets/seungheondoh/cmd-audio-dump/resolve/main/{db_name}{i}.tar.gz"
    for db_name, num_files in db_config.items()
    for i in range(num_files)
]
def download_and_unzip(url):
    """Download one gzip-compressed tar archive and extract it into ``save_dir``.

    The archive file written by ``wget.download`` is left in place after
    extraction; only its contents are unpacked under ``save_dir``.

    Args:
        url: Address of a ``.tar.gz`` archive to fetch.

    Returns:
        None.

    Raises:
        tarfile.TarError: If the downloaded file is not a readable gzip tar
            archive, or (when the ``data`` filter is active) if a member is
            rejected as unsafe.
    """
    # wget.download returns the local filename it saved the payload to.
    filename = wget.download(url)

    with tarfile.open(filename, "r:gz") as tar:
        # The archive comes from the network, so use the "data" extraction
        # filter where the running Python provides it; it rejects members
        # such as absolute paths or links that point outside the destination.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=save_dir, filter="data")
        else:
            # Older interpreters without extraction filters: plain extract.
            tar.extractall(path=save_dir)


def main():
    """Download and extract every archive in ``urls`` using a process pool.

    Prints the total wall-clock duration once all archives are processed.
    """
    os.makedirs(save_dir, exist_ok=True)

    # perf_counter is monotonic, so the measured duration cannot go negative
    # if the system clock is adjusted while downloads are running.
    start_time = time.perf_counter()

    # One worker per CPU, but never more workers than there are archives.
    # Clamp to at least 1 because Pool(processes=0) raises ValueError.
    num_processes = max(1, min(multiprocessing.cpu_count(), len(urls)))
    with multiprocessing.Pool(processes=num_processes) as pool:
        pool.map(download_and_unzip, urls)

    elapsed = time.perf_counter() - start_time
    print(f"\nTotal download time: {int(elapsed // 60)} minutes and {int(elapsed % 60)} seconds")


# The guard keeps worker processes that re-import this module from starting
# their own pools.
if __name__ == "__main__":
    main()