import os

import joblib
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm

# Directory containing your .npz files
data_dir = "./train_data"  # change this to your directory path

# Collect all arrays from .npz files
data_list = []
for file in tqdm(os.listdir(data_dir)):
    if file.endswith(".npz"):
        hsi_path = os.path.join(data_dir, file)
        with np.load(hsi_path) as npz:
            arr = np.ma.MaskedArray(**npz)  # expects "data" and "mask" keys in the .npz
        pixels = arr.reshape(150, -1).transpose()  # (n_pixels, 150 bands)
        # Remove masked values: drop pixels with any masked band so they
        # don't leak into the scaling and PCA fit
        valid = ~np.ma.getmaskarray(pixels).any(axis=1)
        data_list.append(np.asarray(pixels[valid]))

# Stack all images into a single (n_pixels, n_bands) dataset
x = np.vstack(data_list)
print("\nStacked data shape:", x.shape)

# Apply standard scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(x)

# Fit PCA
pca = PCA(n_components=16)  # change number of components as needed
pca.fit(X_scaled)

# Save both scaler and PCA model
joblib.dump({"scaler": scaler, "pca": pca}, "pca_pipeline.pkl")
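
# Optional usage sketch: load the saved scaler + PCA bundle and project a new cube.
# Assumptions: the .npz path below is a hypothetical example, and the new cube has
# the same 150-band layout and "data"/"mask" keys as the training files.
bundle = joblib.load("pca_pipeline.pkl")
scaler, pca = bundle["scaler"], bundle["pca"]

with np.load("./test_data/sample.npz") as npz:  # hypothetical example file
    cube = np.ma.MaskedArray(**npz)

new_pixels = cube.reshape(150, -1).transpose()  # (n_pixels, 150 bands)
keep = ~np.ma.getmaskarray(new_pixels).any(axis=1)  # drop masked pixels, as in training
reduced = pca.transform(scaler.transform(np.asarray(new_pixels[keep])))
print("Reduced shape:", reduced.shape)  # (n_valid_pixels, 16)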