# Extracted from a hosted Space file view (status: sleeping) - file size: 6,355 bytes - revision 6e8eb41
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error
import joblib
# Constants for window sizes
# Number of consecutive rows that form one model input window
# (passed to format_dataset as X_window_size).
X_WINDOW_SIZE = 52
# Number of rows that form one target window (30 // 5 == 6).
# NOTE(review): presumably a 30-minute horizon at 5-minute sampling - confirm.
Y_WINDOW_SIZE = 30 // 5
def format_dataset(df, X_window_size, y_window_size):
    """
    Build flattened (X, y) matrices from per-patient sliding windows.

    For every patient, each run of ``X_window_size`` consecutive rows becomes
    one input sample and the ``y_window_size`` rows immediately following it
    become the matching target. Windows never span two patients.

    Columns are addressed by position: the first two columns and the last
    column are excluded from the features, and the column at position 2 is
    the target. (Presumably the first two are the timestamp and patient id;
    verify against the dataset schema.)

    Patients with fewer than ``X_window_size + y_window_size`` rows cannot
    yield a single complete sample and are skipped instead of aborting the
    whole dataset.

    Args:
        df (pd.DataFrame): Rows for one or more patients; must contain a
            ``patient_id`` column. Rows are used in the order given.
        X_window_size (int): Number of rows in each input window.
        y_window_size (int): Number of rows in each target window.

    Returns:
        tuple[np.ndarray, np.ndarray]: ``X_flat`` with one row per sample and
        ``X_window_size * n_feature_columns`` values per row, and ``y_flat``
        with one row per sample and ``y_window_size`` values per row.

    Raises:
        ValueError: If no patient has enough rows to produce a sample.
    """
    min_rows = X_window_size + y_window_size
    n_columns = df.shape[1]
    sliding_window_view = np.lib.stride_tricks.sliding_window_view

    X_list, y_list = [], []
    for patient in df['patient_id'].unique():
        values = df[df['patient_id'] == patient].values
        # sliding_window_view raises when the window is longer than the data,
        # and anything shorter than min_rows yields zero samples anyway.
        if len(values) < min_rows:
            continue
        X_i = sliding_window_view(values, (X_window_size, n_columns))
        y_i = sliding_window_view(values, (y_window_size, n_columns))
        # Drop trailing input windows that have no complete target after
        # them, and leading target windows that have no input before them,
        # so X_i[k] is followed directly by y_i[k].
        X_list.append(X_i[:-y_window_size])
        y_list.append(y_i[X_window_size:])

    if not X_list:
        raise ValueError(
            "No patient has at least X_window_size + y_window_size "
            f"({min_rows}) rows; cannot build any sample."
        )

    # Each window view has shape (n_windows, 1, window, n_columns); the
    # singleton axis comes from windowing over the full column width.
    X_matrix = np.concatenate(X_list, axis=0)[:, 0]
    y_matrix = np.concatenate(y_list, axis=0)[:, 0]

    # Keep feature columns only (positions 2 .. second-to-last) for X and the
    # single target column (position 2) for y.
    X_matrix = X_matrix[:, :, 2:-1]
    y_matrix = y_matrix[:, :, 2]

    # Flatten X and y for XGBoost input
    X_flat = X_matrix.reshape(X_matrix.shape[0], -1)
    y_flat = y_matrix.reshape(y_matrix.shape[0], -1)
    return X_flat, y_flat
# Fit an estimator on a feature/target pair
def train_model(model, X_train, y_train):
    """
    Fit ``model`` on the supplied training data and hand it back.

    Args:
        model: Estimator object exposing ``fit(X, y)``.
        X_train: Training features.
        y_train: Training targets.

    Returns:
        The same ``model`` object that was passed in, after ``fit`` ran.
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator
# Function to evaluate the model
def evaluate_model(y_true, y_pred, dataset_name="Validation"):
    """
    Compute, print and return the RMSE between targets and predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.
        dataset_name (str): Label used in the printed message.

    Returns:
        The value computed by ``root_mean_squared_error(y_true, y_pred)``,
        so callers can log or compare it instead of parsing stdout.
    """
    rmse = root_mean_squared_error(y_true, y_pred)
    print(f'Root Mean Squared Error on {dataset_name} Data: {rmse:.4f}')
    return rmse
def simple_diagonal_averaging(predictions_df, test_data, context_length, step_columns):
    """
    Attach one averaged prediction per row of ``test_data``, patient by patient.

    Each row of ``predictions_df`` holds the step-wise forecast of one input
    window. The mean over ``step_columns`` of the k-th window of a patient is
    written to that patient's row at position ``context_length + k``. Rows
    without a matching window (the first ``context_length`` rows of each
    patient and any trailing rows) keep the value 0.0.

    NOTE(review): this averages across the forecast steps of a single window;
    it does not combine overlapping windows along anti-diagonals.

    ``predictions_df`` is assumed to be the concatenation, in order of first
    appearance of each patient in ``test_data``, of the windows produced by
    ``format_dataset``: a patient with ``n`` rows owns
    ``max(0, n - context_length - len(step_columns) + 1)`` consecutive
    prediction rows. Each patient reads only its own slice, so predictions of
    one patient are never attributed to another.

    Args:
        predictions_df (pd.DataFrame): DataFrame with step-wise predictions
        test_data (pd.DataFrame): Original test data with a ``patient_id`` column
        context_length (int): Number of context steps used in the model
        step_columns (list): List of step column names

    Returns:
        pd.DataFrame: Copy of ``test_data`` with an added float column
        ``averaged_prediction``; ``test_data`` itself is not modified.
    """
    final_df = test_data.copy()
    n_steps = len(step_columns)

    # One mean per prediction window, computed once instead of row by row.
    window_means = (
        predictions_df[list(step_columns)].mean(axis=1).to_numpy(dtype=float)
    )

    # Float buffer from the start: writing floats into an integer column is an
    # incompatible-dtype assignment that pandas has deprecated.
    averaged = np.zeros(len(final_df), dtype=float)
    patient_ids = final_df['patient_id'].to_numpy()

    offset = 0  # index of the current patient's first row in predictions_df
    for patient_id in test_data['patient_id'].unique():
        # Positional indices, so duplicate or unordered index labels are safe.
        positions = np.flatnonzero(patient_ids == patient_id)
        n_windows = max(0, len(positions) - context_length - n_steps + 1)
        # Slicing clips silently if predictions_df is shorter than expected.
        patient_means = window_means[offset:offset + n_windows]
        targets = positions[context_length:context_length + len(patient_means)]
        averaged[targets] = patient_means
        offset += n_windows

    final_df['averaged_prediction'] = averaged
    return final_df
def main():
    """
    Train, evaluate and persist the XGBoost forecaster, then export predictions.

    Steps:
        1. Load the train / validation / test CSVs from ``../data/processed``
           relative to this script.
        2. Convert each to windowed matrices with ``format_dataset``.
        3. Fit on the training set and report RMSE on the validation set.
        4. Re-fit on training + validation, save the model to
           ``../models/xgb_model.pkl`` and reload it.
        5. Report RMSE on the test set and write the raw step-wise predictions
           and the per-row averaged predictions to ``../data/outputs``.

    The ``models`` and ``data/outputs`` directories are created if missing so
    a completed training run is not lost to a missing folder.
    """
    print("Running machine_learning_approach script...")
    script_dir = os.path.dirname(os.path.abspath(__file__))
    processed_dir = os.path.join(script_dir, '..', 'data', 'processed')
    outputs_dir = os.path.join(script_dir, '..', 'data', 'outputs')
    models_dir = os.path.join(script_dir, '..', 'models')

    test_file = os.path.join(processed_dir, 'test_dataset.csv')
    train_file = os.path.join(processed_dir, 'train_dataset.csv')
    validation_file = os.path.join(processed_dir, 'validation_dataset.csv')

    # Load datasets
    df_train = pd.read_csv(train_file)
    df_validation = pd.read_csv(validation_file)
    df_test = pd.read_csv(test_file)

    # Format datasets
    X_train, y_train = format_dataset(df_train, X_WINDOW_SIZE, Y_WINDOW_SIZE)
    X_val, y_val = format_dataset(df_validation, X_WINDOW_SIZE, Y_WINDOW_SIZE)
    X_test, y_test = format_dataset(df_test, X_WINDOW_SIZE, Y_WINDOW_SIZE)

    # Initialize the model
    xgb_model = xgb.XGBRegressor(
        n_estimators=50,
        learning_rate=0.2,
        max_depth=5,
        objective='reg:squarederror',
        random_state=42,
    )

    # Train model on the training dataset and evaluate on the validation set
    xgb_model = train_model(xgb_model, X_train, y_train)
    y_val_pred = xgb_model.predict(X_val)
    evaluate_model(y_val, y_val_pred, "Validation")

    # Re-train on the combined training and validation dataset
    X_train_complete = np.concatenate((X_train, X_val), axis=0)
    y_train_complete = np.concatenate((y_train, y_val), axis=0)
    xgb_model = train_model(xgb_model, X_train_complete, y_train_complete)

    # Persist the model, then reload it so the test predictions come from the
    # serialized artifact rather than the in-memory object.
    os.makedirs(models_dir, exist_ok=True)
    model_output_path = os.path.join(models_dir, 'xgb_model.pkl')
    joblib.dump(xgb_model, model_output_path)
    xgb_model = joblib.load(model_output_path)

    # Evaluate on the test set
    y_test_pred = xgb_model.predict(X_test)
    evaluate_model(y_test, y_test_pred, "Test")

    # Save raw step-wise test predictions (one row per window)
    os.makedirs(outputs_dir, exist_ok=True)
    predictions_df = pd.DataFrame(y_test_pred)
    raw_predictions_path = os.path.join(outputs_dir, 'ml_predictions_raw.csv')
    predictions_df.to_csv(raw_predictions_path)

    final_results = simple_diagonal_averaging(
        predictions_df,
        df_test,
        X_WINDOW_SIZE,
        predictions_df.columns,
    )

    # Save final results to CSV
    final_results_path = os.path.join(outputs_dir, 'ml_predictions.csv')
    final_results.to_csv(final_results_path, index=False)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()