File size: 5,511 Bytes

b26156a

#!/usr/bin/env python3
"""Extract images and labels from Parquet files and save them into
subfolders by label.

Usage:
    python extract_images.py [--train] [--test] [--output OUTPUT_DIR]

Defaults:
    train: process training data (train-00000-of-00001.parquet)
    test:  process test data (test-00000-of-00001.parquet)
    output: TrainData (relative to script location)
"""
import os
import sys
import argparse
from pathlib import Path
import pyarrow.parquet as pq


def extract_images_from_parquet(parquet_path, output_dir, split_name):
    """Extract images from a Parquet file and save them into label folders."""

    print(f"Processing {parquet_path}...")
    
    # read parquet file
    try:
        table = pq.read_table(parquet_path)
        df = table.to_pandas()
    except Exception as e:
        print(f"Failed to read parquet file: {e}")
        return False
    
    print(f"Found {len(df)} images")
    
    # get unique labels
    unique_labels = sorted(df['label'].unique())
    print(f"Label classes: {unique_labels}")
    
    # create folder for each label
    for label in unique_labels:
        label_dir = output_dir / split_name / f"label_{label}"
        label_dir.mkdir(parents=True, exist_ok=True)
        print(f"Created folder: {label_dir}")
    
    # extract and save images
    success_count = 0
    error_count = 0
    
    for idx, row in df.iterrows():
        try:
            # get image data
            image_struct = row['image']
            image_bytes = image_struct['bytes']
            original_path = image_struct['path']
            label = row['label']
            
            # get file extension
            _, ext = os.path.splitext(original_path)
            if not ext:
                ext = '.jpg'  # default extension
            
            # build a new filename (preserve original base name, avoid collisions)
            base_name = os.path.splitext(os.path.basename(original_path))[0]
            filename = f"{base_name}{ext}"
            
            # ensure filename is unique
            label_dir = output_dir / split_name / f"label_{label}"
            output_path = label_dir / filename
            counter = 1
            while output_path.exists():
                filename = f"{base_name}_{counter}{ext}"
                output_path = label_dir / filename
                counter += 1
            
            # save image
            with open(output_path, 'wb') as f:
                f.write(image_bytes)
            
            success_count += 1
            if success_count % 100 == 0:
                print(f"Processed {success_count} images...")
                
        except Exception as e:
            print(f"Error processing image {idx}: {e}")
            error_count += 1
            continue
    
    print(f"Done! Success: {success_count}, Failed: {error_count}")
    
    # report counts per label
    print("\nImage count per label:")
    for label in unique_labels:
        label_dir = output_dir / split_name / f"label_{label}"
        count = len(list(label_dir.glob("*")))
        print(f"  label {label}: {count} images")
    
    return success_count > 0


def main():
    parser = argparse.ArgumentParser(description="Extract images from Parquet files and organize by label")
    parser.add_argument("--train", action="store_true", help="process training data")
    parser.add_argument("--test", action="store_true", help="process test data")
    parser.add_argument("--output", "-o", default="TrainData", help="output directory")
    
    args = parser.parse_args()
    
    # if neither train nor test specified, do both by default
    if not args.train and not args.test:
        args.train = True
        args.test = True
    
    # set paths
    script_dir = Path(__file__).parent
    yoga_data_dir = script_dir / "YogaDataSet" / "data"
    output_dir = Path(args.output)
    
    # ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Output directory: {output_dir.absolute()}")
    
    success = True
    
    # process training data
    if args.train:
        train_parquet = yoga_data_dir / "train-00000-of-00001.parquet"
        if train_parquet.exists():
            if not extract_images_from_parquet(train_parquet, output_dir, "train"):
                success = False
        else:
            print(f"Training parquet file not found: {train_parquet}")
            success = False
    
    # process test data
    if args.test:
        test_parquet = yoga_data_dir / "test-00000-of-00001.parquet"
        if test_parquet.exists():
            if not extract_images_from_parquet(test_parquet, output_dir, "test"):
                success = False
        else:
            print(f"Test parquet file not found: {test_parquet}")
            success = False
    
    if success:
        print("\n✅ All images extracted!")
        print(f"Images saved to: {output_dir.absolute()}")
        print("Directory structure:")
        print("TrainData/")
        if args.train:
            print("├── train/")
            print("│   ├── label_0/")
            print("│   ├── label_1/")
            print("│   └── ...")
        if args.test:
            print("└── test/")
            print("    ├── label_0/")
            print("    ├── label_1/")
            print("    └── ...")
    else:
        print("\n❌ Errors occurred during extraction")
        sys.exit(1)


if __name__ == "__main__":
    main()