import os
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error
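# Note: root_mean_squared_error requires scikit-learn >= 1.4; on older
# versions, mean_squared_error(..., squared=False) is the equivalent.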
import joblib


# Window-size constants: the model sees X_WINDOW_SIZE past rows and predicts
# the next Y_WINDOW_SIZE rows. 30 // 5 = 6 steps; presumably a 30-minute
# horizon sampled at 5-minute intervals, though the units are an assumption.
X_WINDOW_SIZE = 52
Y_WINDOW_SIZE = 30 // 5

def format_dataset(df, X_window_size, y_window_size):
    """
    Apply a sliding window to each patient's rows and return flattened
    feature and label matrices suitable for XGBoost.
    """
    X_list, y_list = [], []

    for patient in df['patient_id'].unique():
        df_i = df[df['patient_id'] == patient]

        # Sliding window views over the full rows: one view for the input
        # windows, one for the label windows
        X_i = np.lib.stride_tricks.sliding_window_view(df_i.values, (X_window_size, df_i.shape[1]))
        y_i = np.lib.stride_tricks.sliding_window_view(df_i.values, (y_window_size, df_i.shape[1]))

        # Align the views so each input window is paired with the label
        # window that immediately follows it
        X_i = X_i[:-y_window_size]
        y_i = y_i[X_window_size:]

        X_list.append(X_i)
        y_list.append(y_i)

    X_matrix = np.concatenate(X_list, axis=0)
    y_matrix = np.concatenate(y_list, axis=0)

    # Drop the singleton axis produced by the 2-D window shape
    X_matrix = X_matrix.squeeze(axis=1)
    y_matrix = y_matrix.squeeze(axis=1)

    # Keep only the feature columns: drop the first two (timestamp and
    # patient_id) plus the trailing column; the target is column index 2
    X_matrix = X_matrix[:, :, 2:-1]
    y_matrix = y_matrix[:, :, 2]

    # Flatten X and y for XGBoost input
    X_flat = X_matrix.reshape(X_matrix.shape[0], -1)
    y_flat = y_matrix.reshape(y_matrix.shape[0], -1)

    return X_flat, y_flat
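
# Worked example of the window arithmetic (illustrative, not executed): a
# single patient with N = 100 rows, X_WINDOW_SIZE = 52 and Y_WINDOW_SIZE = 6
# yields N - 52 - 6 + 1 = 43 aligned (X, y) pairs; with F feature columns
# kept, X_flat has shape (43, 52 * F) and y_flat has shape (43, 6).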

# Function to train the model
def train_model(model, X_train, y_train):
    """
    Train the given model with the training data.
    """
    model.fit(X_train, y_train)
    return model

# Function to evaluate the model
def evaluate_model(y_true, y_pred, dataset_name="Validation"):
    """
    Report and return the RMSE of predictions on the given dataset.
    """
    rmse = root_mean_squared_error(y_true, y_pred)
    print(f'Root Mean Squared Error on {dataset_name} Data: {rmse:.4f}')
    return rmse

def simple_diagonal_averaging(predictions_df, test_data, context_length, step_columns):
    """
    Simple per-patient averaging of step-wise predictions.
    Skips the first context_length rows of each patient and, for every later
    timestamp, averages the step columns of the corresponding prediction row.

    Args:
        predictions_df (pd.DataFrame): DataFrame with step-wise predictions,
            rows stacked patient by patient (matching format_dataset)
        test_data (pd.DataFrame): Original test data with patient IDs
        context_length (int): Number of context steps used in the model
        step_columns (list): List of step column names

    Returns:
        pd.DataFrame: DataFrame with averaged predictions
    """
    # Create a new dataframe for the final results
    final_df = test_data.copy()

    # Placeholder value for rows that receive no prediction
    final_df['averaged_prediction'] = 0

    n_steps = len(step_columns)

    # Predictions are stacked patient by patient, so track where each
    # patient's block starts within predictions_df
    pred_offset = 0

    # Process each patient separately
    for patient_id in test_data['patient_id'].unique():
        patient_indices = final_df[final_df['patient_id'] == patient_id].index

        # Each patient contributes one prediction row per complete
        # (context, horizon) pair
        n_pred_rows = max(len(patient_indices) - context_length - n_steps + 1, 0)

        # For each row after the context window
        for i in range(context_length, len(patient_indices)):
            local_idx = i - context_length
            if local_idx >= n_pred_rows:
                break

            # Average the predicted steps of the matching prediction row
            avg_prediction = predictions_df.iloc[pred_offset + local_idx][step_columns].mean()
            final_df.loc[patient_indices[i], 'averaged_prediction'] = avg_prediction

        pred_offset += n_pred_rows

    return final_df
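
# Alignment sketch (assuming X_WINDOW_SIZE = 52 and 6 step columns): test row
# i of a patient (i >= 52) receives the mean of the prediction row whose
# 6-step horizon starts at row i, i.e. the forecasts for rows i .. i + 5 made
# from the context window ending at row i - 1. Hypothetical 2-patient case:
# with 60 and 70 rows, predictions_df has (60 - 58 + 1) + (70 - 58 + 1)
# = 3 + 13 = 16 rows, and the second patient's block starts at pred_offset = 3.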

def main():
    print("Running machine_learning_approach script...")

    script_dir = os.path.dirname(os.path.abspath(__file__))
    test_file = os.path.join(script_dir, '..', 'data', 'processed', 'test_dataset.csv')
    train_file = os.path.join(script_dir, '..', 'data', 'processed', 'train_dataset.csv')
    validation_file = os.path.join(script_dir, '..', 'data', 'processed', 'validation_dataset.csv')

    # Load datasets
    df_train = pd.read_csv(train_file)
    df_validation = pd.read_csv(validation_file)
    df_test = pd.read_csv(test_file)

    # Format datasets
    X_train, y_train = format_dataset(df_train, X_WINDOW_SIZE, Y_WINDOW_SIZE)
    X_val, y_val = format_dataset(df_validation, X_WINDOW_SIZE, Y_WINDOW_SIZE)
    X_test, y_test = format_dataset(df_test, X_WINDOW_SIZE, Y_WINDOW_SIZE)

    # Initialize the model
    xgb_model = xgb.XGBRegressor(
        n_estimators=50, 
        learning_rate=0.2, 
        max_depth=5, 
        objective='reg:squarederror', 
        random_state=42
    )
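    # Note: y_train has Y_WINDOW_SIZE columns, so the fit below relies on
    # XGBoost's native multi-output regression (available in xgboost >= 1.6).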

    # Train model on the training dataset
    xgb_model = train_model(xgb_model, X_train, y_train)

    y_val_pred = xgb_model.predict(X_val)

    # Evaluate on the validation set
    evaluate_model(y_val, y_val_pred, "Validation")

    # Re-train on the combined training and validation dataset
    X_train_complete = np.concatenate((X_train, X_val), axis=0)
    y_train_complete = np.concatenate((y_train, y_val), axis=0)
    xgb_model = train_model(xgb_model, X_train_complete, y_train_complete)

    # Ensure the models directory exists before saving
    model_output_path = os.path.join(script_dir, '..', 'models', 'xgb_model.pkl')
    os.makedirs(os.path.dirname(model_output_path), exist_ok=True)
    joblib.dump(xgb_model, model_output_path)

    # Reload the serialized model as a round-trip sanity check
    xgb_model = joblib.load(model_output_path)
    
    y_test_pred = xgb_model.predict(X_test)

    # Evaluate on the test set
    evaluate_model(y_test, y_test_pred, "Test")

    # Save the raw step-wise test predictions
    predictions_df = pd.DataFrame(y_test_pred)
    raw_output_path = os.path.join(script_dir, '..', 'data', 'outputs', 'ml_predictions_raw.csv')
    os.makedirs(os.path.dirname(raw_output_path), exist_ok=True)
    predictions_df.to_csv(raw_output_path)

    # Collapse the step-wise predictions to one value per test row
    final_results = simple_diagonal_averaging(
        predictions_df,
        df_test,
        X_WINDOW_SIZE,
        list(predictions_df.columns)
    )
    
    # Save final results to CSV
    final_results_path = os.path.join(script_dir, '..', 'data', 'outputs', 'ml_predictions.csv')
    final_results.to_csv(final_results_path, index=False)
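    # ml_predictions.csv mirrors the test set with one extra
    # 'averaged_prediction' column; rows inside each patient's initial
    # context window keep the 0 placeholder.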
    

# Main entry point
if __name__ == '__main__':
    main()