"""
Preprocessing utilities for polymer classification app.

Adapted from the original scripts/preprocess_dataset.py for
Hugging Face Spaces deployment.
"""

import numpy as np
from scipy.signal import savgol_filter

# Default number of points every spectrum is resampled to.
TARGET_LENGTH = 500


def remove_baseline(y):
    """Simple baseline correction using polynomial fitting (order 2).

    Fits a quadratic to the whole trace, treats it as the baseline, and
    subtracts it.

    Args:
        y (array-like): Intensity values.

    Returns:
        np.ndarray: Baseline-corrected intensities.
    """
    y = np.asarray(y, dtype=float)
    x = np.arange(len(y))
    coeffs = np.polyfit(x, y, deg=2)
    baseline = np.polyval(coeffs, x)
    return y - baseline


def normalize_spectrum(y):
    """Min-max normalization to [0, 1].

    Implemented in plain numpy (no scikit-learn dependency). A constant
    spectrum (zero range) maps to all zeros, matching the behavior of
    sklearn.preprocessing.minmax_scale that this replaces.

    Args:
        y (array-like): Intensity values.

    Returns:
        np.ndarray: Normalized intensities in [0, 1].
    """
    y = np.asarray(y, dtype=float)
    y_min = y.min()
    span = y.max() - y_min
    if span == 0:
        # Avoid division by zero; minmax_scale also returns zeros here.
        return np.zeros_like(y)
    return (y - y_min) / span


def smooth_spectrum(y, window_length=11, polyorder=2):
    """Apply Savitzky-Golay smoothing.

    For short inputs the window is shrunk to the largest odd size that
    fits; spectra too short to filter at all are returned unchanged.

    Args:
        y (array-like): Intensity values.
        window_length (int): Filter window size (must be odd).
        polyorder (int): Polynomial order of the smoothing filter.

    Returns:
        np.ndarray: Smoothed intensities (or the input unchanged if it
        is too short to filter).
    """
    if len(y) < window_length:
        # savgol_filter requires an odd window no larger than the data.
        window_length = len(y) if len(y) % 2 == 1 else len(y) - 1
    if window_length < 3:
        return y
    return savgol_filter(y, window_length, polyorder)


def resample_spectrum(x, y, target_len=TARGET_LENGTH):
    """Resample a spectrum to a fixed number of points via linear interpolation.

    Args:
        x (array-like): Wavenumber values.
        y (array-like): Intensity values.
        target_len (int): Target number of points.

    Returns:
        np.ndarray: Intensities interpolated onto ``target_len`` evenly
        spaced points spanning [min(x), max(x)].

    Raises:
        ValueError: If x and y differ in length or fewer than 2 points
            are supplied.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    if len(x) != len(y):
        raise ValueError(f"x and y must have same length: {len(x)} vs {len(y)}")
    if len(x) < 2:
        raise ValueError("Need at least 2 points for interpolation")

    # np.interp requires monotonically increasing sample points.
    order = np.argsort(x)
    x_sorted = x[order]
    y_sorted = y[order]

    # Collapse duplicate x values by averaging their y values.
    # Vectorized via bincount (replaces an O(n*k) Python loop).
    if len(np.unique(x_sorted)) != len(x_sorted):
        x_unique, inverse = np.unique(x_sorted, return_inverse=True)
        counts = np.bincount(inverse)
        y_sorted = np.bincount(inverse, weights=y_sorted) / counts
        x_sorted = x_unique

    # The uniform grid spans exactly [min, max], so no extrapolation
    # (and hence no NaN fill) can ever occur.
    x_uniform = np.linspace(x_sorted[0], x_sorted[-1], target_len)
    return np.interp(x_uniform, x_sorted, y_sorted)


def preprocess_spectrum(x, y, target_len=TARGET_LENGTH, baseline_correction=False,
                        apply_smoothing=False, normalize=False):
    """Complete preprocessing pipeline for a single spectrum.

    Args:
        x (array-like): Wavenumber values.
        y (array-like): Intensity values.
        target_len (int): Number of points to resample to
            (defaults to the module-wide TARGET_LENGTH).
        baseline_correction (bool): Whether to apply baseline removal.
        apply_smoothing (bool): Whether to apply Savitzky-Golay smoothing.
        normalize (bool): Whether to apply min-max normalization.

    Returns:
        np.ndarray: Preprocessed spectrum of length ``target_len``.
    """
    # Resample first so every optional step sees a fixed-length spectrum.
    y_processed = resample_spectrum(x, y, target_len=target_len)

    if baseline_correction:
        y_processed = remove_baseline(y_processed)
    if apply_smoothing:
        y_processed = smooth_spectrum(y_processed)
    if normalize:
        y_processed = normalize_spectrum(y_processed)

    return y_processed