import os

import joblib
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm  # plain tqdm works in scripts; use tqdm.notebook inside Jupyter

# Directory containing your .npz files
data_dir = "./train_data"  # change this to your directory path
# Collect all pixel spectra from the .npz files
data_list = []
for file in tqdm(os.listdir(data_dir)):
    if file.endswith(".npz"):
        hsi_path = os.path.join(data_dir, file)
        with np.load(hsi_path) as npz:
            arr = np.ma.MaskedArray(**npz)
        # Flatten each (150, H, W) cube to (pixels, 150), then drop pixels
        # where any band is masked so fill values do not leak into the PCA
        flat = arr.reshape(150, -1).transpose()
        keep = ~np.ma.getmaskarray(flat).any(axis=1)
        data_list.append(flat.data[keep])
# Stack all per-image pixel matrices into a single (n_pixels, 150) dataset
x = np.vstack(data_list)
print("\n\n")
print(x.shape)
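
# Optional (a sketch, not part of the original pipeline): fitting StandardScaler
# and PCA on every pixel can be memory-heavy for large collections. If needed,
# fit on a random subsample instead; the 1_000_000 sample size is an arbitrary choice.
# rng = np.random.default_rng(0)
# idx = rng.choice(x.shape[0], size=min(1_000_000, x.shape[0]), replace=False)
# x = x[idx]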
# Apply standard scaling (zero mean, unit variance per band)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(x)
# Fit PCA
pca = PCA(n_components=16) # change number of components as needed
pca.fit(X_scaled)
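
# Optional sanity check: report how much variance the 16 components retain,
# to guide the n_components choice above
print("cumulative explained variance:",
      np.cumsum(pca.explained_variance_ratio_)[-1])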
# Save both scaler and PCA model
joblib.dump({"scaler": scaler, "pca": pca}, "pca_pipeline.pkl")
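
# Usage at inference time (a minimal sketch; "some_image.npz" is a placeholder
# name and assumes the same {data, mask} .npz layout as the training files):
# bundle = joblib.load("pca_pipeline.pkl")
# with np.load("some_image.npz") as npz:
#     cube = np.ma.MaskedArray(**npz)
# pixels = cube.reshape(150, -1).transpose().data  # (n_pixels, 150)
# reduced = bundle["pca"].transform(bundle["scaler"].transform(pixels))  # (n_pixels, 16)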