""" This script preprocesses a dataset of spectra by resampling and labeling the data. Functions: - resample_spectrum(x, y, target_len): Resamples a spectrum to a fixed number of points. - preprocess_dataset(...): Loads, resamples, and applies optional preprocessing steps: - baseline correction - Savitzky-Golay smoothing - min-max normalization The script expects the dataset directory to contain text files representing spectra. Each file is: 1. Listed using `list_txt_files()` 2. Labeled using `label_file()` 3. Loaded using `load_spectrum()` 4. Resampled and optionally cleaned 5. Returned as arrays suitable for ML training Dependencies: - numpy - scipy.interpolate, scipy.signal - sklearn.preprocessing - list_spectra (custom) - plot_spectrum (custom) """ import os import sys sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) import numpy as np from scipy.interpolate import interp1d from scipy.signal import savgol_filter from sklearn.preprocessing import minmax_scale from scripts.discover_raman_files import list_txt_files, label_file from scripts.plot_spectrum import load_spectrum # Default resample target TARGET_LENGTH = 500 # Optional preprocessing steps def remove_baseline(y): """Simple baseline correction using polynomial fitting (order 2)""" x = np.arange(len(y)) coeffs = np.polyfit(x, y, deg=2) baseline = np.polyval(coeffs, x) return y - baseline def normalize_spectrum(y): """Min-max normalization to [0, 1]""" return minmax_scale(y) def smooth_spectrum(y, window_length=11, polyorder=2): """Apply Savitzky-Golay smoothing.""" return savgol_filter(y, window_length, polyorder) def resample_spectrum(x, y, target_len=TARGET_LENGTH): """Resample a spectrum to a fixed number of points.""" f_interp = interp1d(x, y, kind='linear', fill_value='extrapolate') x_uniform = np.linspace(min(x), max(x), target_len) y_uniform = f_interp(x_uniform) return y_uniform def preprocess_dataset( dataset_dir, target_len=500, baseline_correction=False, apply_smoothing=False, normalize=False ): """ Load, resample, and preprocess all valid spectra in the dataset. Args: dataset_dir (str): Path to the dataset target_len (int): Number of points to resample to baseline_correction (bool): Whether to apply baseline removal apply_smoothing (bool): Whether to apply Savitzky-Golay smoothing normalize (bool): Whether to apply min-max normalization Returns: X (np.ndarray): Preprocessed spectra y (np.ndarray): Corresponding labels """ txt_paths = list_txt_files(dataset_dir) X, y_labels = [], [] for path in txt_paths: label = label_file(path) if label is None: continue x_raw, y_raw = load_spectrum(path) if len(x_raw) < 10: continue # Skip files with too few points # Resample y_processed = resample_spectrum(x_raw, y_raw, target_len=target_len) # Optional preprocessing if baseline_correction: y_processed = remove_baseline(y_processed) if apply_smoothing: y_processed = smooth_spectrum(y_processed) if normalize: y_processed = normalize_spectrum(y_processed) X.append(y_processed) y_labels.append(label) return np.array(X), np.array(y_labels) # Optional: Run directly for testing if __name__ == "__main__": dataset_dir = os.path.join( "datasets", "rdwp" ) X, y = preprocess_dataset(dataset_dir) print(f"X shape: {X.shape}") print(f"y shape: {y.shape}") print(f"Label distribution: {np.bincount(y)}")