File size: 3,648 Bytes
65f2520
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""

Preprocessing utilities for polymer classification app.

Adapted from the original scripts/preprocess_dataset.py for Hugging Face Spaces deployment.

"""

import numpy as np
from scipy.interpolate import interp1d
from scipy.signal import savgol_filter
from sklearn.preprocessing import minmax_scale

# Default number of points a spectrum is resampled to; this is the default
# value of the `target_len` argument of resample_spectrum().
TARGET_LENGTH = 500

def remove_baseline(y, degree=2):
    """Subtract a least-squares polynomial baseline from a spectrum.

    A polynomial is fitted to the intensities against their sample index
    (0, 1, 2, ...) and then subtracted, so only the residual remains.

    Args:
        y (array-like): Intensity values.
        degree (int): Order of the baseline polynomial. Defaults to 2.

    Returns:
        np.ndarray: Baseline-corrected intensities, same length as ``y``.
    """
    y = np.asarray(y)
    n_points = len(y)

    # Nothing to fit: np.polyfit rejects empty input, so hand back an empty
    # result instead of failing deep inside the fit.
    if n_points == 0:
        return np.zeros(0, dtype=float)

    # A polynomial of order k needs k + 1 points to be determined. Clamping
    # keeps the fit well-posed for very short inputs; the fit is then exact
    # and the residual is zero.
    fit_degree = min(degree, n_points - 1)

    index = np.arange(n_points)
    coeffs = np.polyfit(index, y, deg=fit_degree)
    baseline = np.polyval(coeffs, index)
    return y - baseline

def normalize_spectrum(y):
    """Rescale intensities to the [0, 1] range with min-max scaling.

    Args:
        y (array-like): Intensity values.

    Returns:
        The scaled values as returned by ``minmax_scale``.
    """
    scaled = minmax_scale(y, feature_range=(0, 1))
    return scaled

def smooth_spectrum(y, window_length=11, polyorder=2):
    """Apply Savitzky-Golay smoothing.

    If the spectrum has fewer points than ``window_length``, the window is
    shrunk to the largest odd size that fits. When even that window is too
    small to smooth with (below 3 points, or not larger than ``polyorder``),
    the input is returned unchanged.

    Args:
        y (array-like): Intensity values.
        window_length (int): Size of the smoothing window. Defaults to 11.
        polyorder (int): Order of the fitted polynomial. Defaults to 2.

    Returns:
        The smoothed intensities, or ``y`` itself when it is too short to
        smooth.
    """
    n_points = len(y)
    if n_points < window_length:
        # Largest odd window that still fits inside the data.
        window_length = n_points if n_points % 2 == 1 else n_points - 1
        # savgol_filter requires polyorder < window_length; a shrunken window
        # that violates this cannot smooth, so fall back to the raw data
        # rather than raising.
        if window_length < 3 or window_length <= polyorder:
            return y
    return savgol_filter(y, window_length, polyorder)

def resample_spectrum(x, y, target_len=TARGET_LENGTH):
    """
    Resample a spectrum to a fixed number of points using linear interpolation.

    The points are first sorted by x, y values that share the same x are
    averaged, and the result is evaluated on ``target_len`` evenly spaced
    x positions spanning the smallest to the largest x value.

    Args:
        x (array-like): Wavenumber values
        y (array-like): Intensity values
        target_len (int): Target number of points

    Returns:
        np.ndarray: Resampled intensity values

    Raises:
        ValueError: If x and y differ in length, or hold fewer than 2 points.
    """
    # Ensure inputs are numpy arrays
    x = np.asarray(x)
    y = np.asarray(y)

    # Check for valid input
    if len(x) != len(y):
        raise ValueError(f"x and y must have same length: {len(x)} vs {len(y)}")

    if len(x) < 2:
        raise ValueError("Need at least 2 points for interpolation")

    # Sort by x values to ensure monotonic order
    order = np.argsort(x)
    x_sorted = x[order]
    y_sorted = y[order]

    # Collapse repeated x values by averaging their y values. bincount sums
    # the y values of each group in one pass; dividing by the group sizes
    # yields the per-x mean without a Python-level loop.
    x_unique, group_ids, group_sizes = np.unique(
        x_sorted, return_inverse=True, return_counts=True
    )
    if len(x_unique) != len(x_sorted):
        y_sorted = np.bincount(group_ids, weights=y_sorted) / group_sizes
        x_sorted = x_unique

    # Create interpolation function
    f_interp = interp1d(x_sorted, y_sorted, kind='linear', bounds_error=False, fill_value=np.nan)

    # Generate uniform grid; x_sorted is ascending, so its ends are the
    # minimum and maximum.
    x_uniform = np.linspace(x_sorted[0], x_sorted[-1], target_len)
    return f_interp(x_uniform)

def preprocess_spectrum(x, y, target_len=500, baseline_correction=False,
                       apply_smoothing=False, normalize=False):
    """
    Run the full preprocessing pipeline on one spectrum.

    The spectrum is always resampled first. The optional stages then run in
    a fixed order: baseline removal, smoothing, normalization.

    Args:
        x (array-like): Wavenumber values
        y (array-like): Intensity values
        target_len (int): Number of points to resample to
        baseline_correction (bool): Whether to apply baseline removal
        apply_smoothing (bool): Whether to apply Savitzky-Golay smoothing
        normalize (bool): Whether to apply min-max normalization

    Returns:
        np.ndarray: Preprocessed spectrum
    """
    # Optional stages, listed in the order they are applied.
    optional_stages = (
        (baseline_correction, remove_baseline),
        (apply_smoothing, smooth_spectrum),
        (normalize, normalize_spectrum),
    )

    # Resampling is unconditional and always comes first.
    spectrum = resample_spectrum(x, y, target_len=target_len)

    for enabled, stage in optional_stages:
        if enabled:
            spectrum = stage(spectrum)

    return spectrum