import os

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error  # requires scikit-learn >= 1.4
# Constants for window sizes
X_WINDOW_SIZE = 52       # number of past time steps used as input
Y_WINDOW_SIZE = 30 // 5  # number of future time steps to predict
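# A note on the window arithmetic (illustrative; the 5 presumably reflects a
# 5-minute sampling interval, so 30 // 5 = 6 steps cover a 30-minute horizon):
# for a patient with T rows, format_dataset() below yields
#   T - X_WINDOW_SIZE - Y_WINDOW_SIZE + 1
# (input, target) pairs, e.g. T = 100 -> 100 - 52 - 6 + 1 = 43 windows.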
def format_dataset(df, X_window_size, y_window_size):
    """
    Format the dataset by applying a sliding-window transform per patient and
    preparing the flattened input features and multi-step labels.
    """
    X_list, y_list = [], []
    for patient in df['patient_id'].unique():
        df_i = df[df['patient_id'] == patient]
        # Sliding-window views over the full row width generate candidate
        # feature and label windows
        X_i = np.lib.stride_tricks.sliding_window_view(df_i.values, (X_window_size, df_i.shape[1]))
        y_i = np.lib.stride_tricks.sliding_window_view(df_i.values, (y_window_size, df_i.shape[1]))
        # Align the windows: each input window is paired with the label
        # window that starts immediately after it
        X_i = X_i[:-y_window_size]
        y_i = y_i[X_window_size:]
        X_list.append(X_i)
        y_list.append(y_i)
    X_matrix = np.concatenate(X_list, axis=0)
    y_matrix = np.concatenate(y_list, axis=0)
    # Squeeze out the singleton axis introduced by sliding_window_view
    X_matrix = X_matrix.reshape(X_matrix.shape[0], X_matrix.shape[2], X_matrix.shape[3])
    y_matrix = y_matrix.reshape(y_matrix.shape[0], y_matrix.shape[2], y_matrix.shape[3])
    # Keep feature columns only: drop the first two (timestamp, patient_id)
    # and the last column; column 2 is the prediction target
    X_matrix = X_matrix[:, :, 2:-1]
    y_matrix = y_matrix[:, :, 2]
    # Flatten X and y into 2-D matrices for XGBoost input
    X_flat = X_matrix.reshape(X_matrix.shape[0], -1)
    y_flat = y_matrix.reshape(y_matrix.shape[0], -1)
    return X_flat, y_flat
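# A minimal usage sketch (illustrative; assumes the processed CSVs follow the
# layout [timestamp, patient_id, target, ..., last_col] implied above):
#   df = pd.read_csv('train_dataset.csv')
#   X, y = format_dataset(df, X_WINDOW_SIZE, Y_WINDOW_SIZE)
#   # X.shape == (n_windows, X_WINDOW_SIZE * (df.shape[1] - 3))
#   # y.shape == (n_windows, Y_WINDOW_SIZE)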
# Function to train the model
def train_model(model, X_train, y_train):
    """
    Train the given model on the training data.
    """
    model.fit(X_train, y_train)
    return model
# Function to evaluate the model
def evaluate_model(y_true, y_pred, dataset_name="Validation"):
    """
    Evaluate model performance on the provided dataset.
    """
    rmse = root_mean_squared_error(y_true, y_pred)
    print(f'Root Mean Squared Error on {dataset_name} Data: {rmse:.4f}')
def simple_diagonal_averaging(predictions_df, test_data, context_length, step_columns):
    """
    Simple approach to diagonally averaging predictions by patient:
    skips the first context_length rows of each patient, then assigns each
    remaining timestamp the mean of the prediction window starting there.

    Args:
        predictions_df (pd.DataFrame): DataFrame with step-wise predictions
        test_data (pd.DataFrame): Original test data with patient IDs
        context_length (int): Number of context steps used by the model
        step_columns (list): List of step column names

    Returns:
        pd.DataFrame: DataFrame with averaged predictions
    """
    # Create a new dataframe for the final results
    final_df = test_data.copy()
    # Initialize the prediction column with NaN (no prediction available yet)
    final_df['averaged_prediction'] = np.nan
    # predictions_df stacks the windows of all patients in order, so keep a
    # running offset to the first prediction row of the current patient
    pred_offset = 0
    # Process each patient separately
    for patient_id in test_data['patient_id'].unique():
        # Get the row indices for this patient
        patient_indices = final_df[final_df['patient_id'] == patient_id].index
        # Number of prediction windows format_dataset produced for this patient
        n_preds = max(len(patient_indices) - context_length - len(step_columns) + 1, 0)
        # Skip the first context_length rows for this patient
        for i in range(min(context_length, len(patient_indices)), len(patient_indices)):
            pred_row_idx = pred_offset + (i - context_length)
            # Rows near the end of a patient's series have no prediction window
            if pred_row_idx >= pred_offset + n_preds or pred_row_idx >= len(predictions_df):
                break
            # Average the predictions across all forecast steps
            avg_prediction = predictions_df.iloc[pred_row_idx][step_columns].mean()
            final_df.loc[patient_indices[i], 'averaged_prediction'] = avg_prediction
        pred_offset += n_preds
    return final_df
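# Alignment example (illustrative): with context_length = 52 and 6 step
# columns, a patient with 100 rows yields 43 prediction windows; patient row
# 52 receives the mean of prediction row 0's 6 forecast steps, row 53 the
# mean of row 1, and so on; rows 95-99 keep NaN because no full window of 6
# future steps starts there.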
def main():
    print("Running machine_learning_approach script...")
    script_dir = os.path.dirname(os.path.abspath(__file__))
    test_file = os.path.join(script_dir, '..', 'data', 'processed', 'test_dataset.csv')
    train_file = os.path.join(script_dir, '..', 'data', 'processed', 'train_dataset.csv')
    validation_file = os.path.join(script_dir, '..', 'data', 'processed', 'validation_dataset.csv')

    # Load datasets
    df_train = pd.read_csv(train_file)
    df_validation = pd.read_csv(validation_file)
    df_test = pd.read_csv(test_file)

    # Format datasets with the sliding-window transform
    X_train, y_train = format_dataset(df_train, X_WINDOW_SIZE, Y_WINDOW_SIZE)
    X_val, y_val = format_dataset(df_validation, X_WINDOW_SIZE, Y_WINDOW_SIZE)
    X_test, y_test = format_dataset(df_test, X_WINDOW_SIZE, Y_WINDOW_SIZE)
    # Initialize the model
    xgb_model = xgb.XGBRegressor(
        n_estimators=50,
        learning_rate=0.2,
        max_depth=5,
        objective='reg:squarederror',
        random_state=42
    )
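    # Note: y_train has Y_WINDOW_SIZE columns; recent XGBoost versions
    # (>= 1.6, to my understanding) accept a 2-D target and fit a
    # multi-output regressor, by default one set of trees per output column.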
    # Train model on the training dataset
    xgb_model = train_model(xgb_model, X_train, y_train)
    y_val_pred = xgb_model.predict(X_val)
    # Evaluate on the validation set
    evaluate_model(y_val, y_val_pred, "Validation")

    # Re-train on the combined training and validation dataset
    X_train_complete = np.concatenate((X_train, X_val), axis=0)
    y_train_complete = np.concatenate((y_train, y_val), axis=0)
    xgb_model = train_model(xgb_model, X_train_complete, y_train_complete)

    # Persist the model, then reload it to verify the round trip
    model_output_path = os.path.join(script_dir, '..', 'models', 'xgb_model.pkl')
    os.makedirs(os.path.dirname(model_output_path), exist_ok=True)
    joblib.dump(xgb_model, model_output_path)
    xgb_model = joblib.load(model_output_path)

    y_test_pred = xgb_model.predict(X_test)
    # Evaluate on the test set
    evaluate_model(y_test, y_test_pred, "Test")
    # Save raw step-wise test predictions
    predictions_df = pd.DataFrame(y_test_pred)
    output_dir = os.path.join(script_dir, '..', 'data', 'outputs')
    os.makedirs(output_dir, exist_ok=True)
    predictions_df.to_csv(os.path.join(output_dir, 'ml_predictions_raw.csv'), index=False)

    # Collapse the step-wise predictions to one value per timestamp
    final_results = simple_diagonal_averaging(
        predictions_df,
        df_test,
        X_WINDOW_SIZE,
        predictions_df.columns
    )
    # Save final results to CSV
    final_results_path = os.path.join(output_dir, 'ml_predictions.csv')
    final_results.to_csv(final_results_path, index=False)
# Main entry point
if __name__ == '__main__':
    main()