|
import argparse
|
|
import yaml
|
|
from datasets import load_dataset
|
|
|
|
|
|
def load_config(config_path):
    """Parse the YAML file at ``config_path`` and return its contents.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        The object produced by ``yaml.safe_load`` for the document
        (``None`` when the file is empty).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale encoding, which is not UTF-8 everywhere.
    with open(config_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)
|
|
|
|
|
|
def download_huggingface_dataset(config):
    """Download one Hugging Face dataset described by ``config`` and print a summary.

    Args:
        config: Mapping that provides ``dataset_name`` (the identifier
            handed to ``load_dataset``) and ``local_dir`` (handed to
            ``load_dataset`` as ``cache_dir``).

    Raises:
        KeyError: If ``dataset_name`` or ``local_dir`` is missing from
            ``config``.
    """
    dataset_name = config['dataset_name']
    local_dir = config['local_dir']

    # rpartition accepts identifiers without a namespace (e.g. "squad"):
    # the part before the last '/' is then an empty string. Unpacking
    # split('/') into two names would raise ValueError for such input
    # before the download even starts.
    user_name, _, model_hub_name = dataset_name.rpartition('/')

    ds = load_dataset(dataset_name, cache_dir=local_dir)

    print(f"User Name: {user_name}")
    print(f"Model Hub Name: {model_hub_name}")
    print(f"Dataset saved to: {local_dir}")
    print(f"Dataset info: {ds}")
|
|
|
|
|
|
def main():
    """Parse the command line and download every dataset listed in the config file.

    Reads ``--config_path`` (default ``configs/datasets_info.yaml``),
    loads it with ``load_config`` and, for each entry whose ``platform``
    is ``'HuggingFace'``, calls ``download_huggingface_dataset``. Any
    other entry is reported as unsupported and skipped.
    """
    parser = argparse.ArgumentParser(description="Download dataset from Hugging Face")
    parser.add_argument(
        '--config_path',
        type=str,
        default='configs/datasets_info.yaml',
        help='Path to the dataset configuration YAML file',
    )
    args = parser.parse_args()

    # An empty YAML document loads as None; treat it as "no datasets"
    # rather than failing on iteration.
    configs = load_config(args.config_path) or []

    for config in configs:
        # .get() lets an entry without a 'platform' key fall through to
        # the "unsupported" message instead of raising KeyError.
        platform = config.get('platform')
        if platform == 'HuggingFace':
            download_huggingface_dataset(config)
        else:
            print(f"Unsupported platform: {platform}")


if __name__ == "__main__":
    main()