Spaces:

iaravagni
/

BloodGlucosePrediction

Sleeping

BloodGlucosePrediction / scripts /make_dataset.py

iaravagni

Initial commit

6e8eb41 5 months ago

10.4 kB

	import pandas as pd
	import numpy as np
	import json
	import os

	# Absolute path of the directory that contains this script. Every data path
	# built in this file is joined onto it, so the paths do not depend on the
	# current working directory.
	SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

	def clean_blood_glucose_df(bg_df):
	    """
	    Keep only the Estimated Glucose Value (EGV) records of a glucose export.

	    Args:
	        bg_df (pd.DataFrame): Blood glucose data with an 'Event Type' column

	    Returns:
	        pd.DataFrame: The rows of ``bg_df`` whose 'Event Type' equals 'EGV',
	            with their original index labels
	    """
	    # Boolean mask: True for genuine glucose readings, False for every
	    # other event type recorded in the same frame.
	    is_egv = bg_df['Event Type'] == 'EGV'
	    return bg_df.loc[is_egv]

	def get_accelerometer_values(acc_df, time_series_df, window_size='1h'):
	    """
	    Add a recency-weighted accelerometer magnitude to every time series row.

	    For each timestamp ``ts`` in ``time_series_df`` the accelerometer samples
	    inside the trailing window ``[ts - window_size, ts]`` are averaged with
	    weights ``1 / (seconds_before_ts + 1)``, so newer samples count more.
	    Timestamps with no sample in their window get 0.

	    Both inputs are modified in place: ``acc_df`` gains a 'Magnitude' column
	    and ``time_series_df`` gains an 'Accelerometer' column.

	    Args:
	        acc_df (pd.DataFrame): Accelerometer data with a 'Timestamp' column
	            and the axis columns ' acc_x', ' acc_y', ' acc_z' (the names
	            include a leading space)
	        time_series_df (pd.DataFrame): DataFrame with a 'Timestamp' column to
	            add accelerometer values to
	        window_size (str, optional): Length of the trailing window, in any
	            form accepted by ``pd.Timedelta``. Defaults to '1h'.

	    Returns:
	        pd.DataFrame: ``time_series_df`` itself, with the added
	            'Accelerometer' column
	    """
	    # Euclidean norm of the three axes, rounded to two decimals.
	    acc_df['Magnitude'] = np.sqrt(
	        acc_df[' acc_x'] ** 2 + acc_df[' acc_y'] ** 2 + acc_df[' acc_z'] ** 2
	    ).round(2)
	    acc_df['Magnitude'] = pd.to_numeric(acc_df['Magnitude'], errors='coerce')

	    window_timedelta = pd.Timedelta(window_size)
	    # Loop-invariant column lookup, hoisted out of the per-timestamp loop.
	    acc_times = acc_df['Timestamp']

	    weighted_avgs = []
	    for ts in time_series_df['Timestamp']:
	        # Samples inside the trailing window (both ends inclusive)
	        in_window = (acc_times >= ts - window_timedelta) & (acc_times <= ts)
	        relevant_acc = acc_df[in_window]

	        if relevant_acc.empty:
	            weighted_avgs.append(0)
	            continue

	        # More recent samples get a larger weight; the +1 keeps the weight
	        # finite for a sample taken exactly at ts (time difference of 0 s).
	        time_diffs = (ts - relevant_acc['Timestamp']).dt.total_seconds()
	        weights = 1 / (time_diffs + 1)
	        weighted_avg = (relevant_acc['Magnitude'] * weights).sum() / weights.sum()
	        weighted_avgs.append(weighted_avg.round(2))

	    time_series_df['Accelerometer'] = weighted_avgs

	    return time_series_df

	def get_food_values(food_df, time_series_df, window_size='1h'):
	    """
	    Attach trailing-window food totals to every time series row.

	    For each timestamp ``ts`` in ``time_series_df`` the food log entries that
	    started inside ``[ts - window_size, ts]`` are summed per nutrient.
	    Timestamps with no entry in their window get 0.0 for all three metrics.
	    ``time_series_df`` is modified in place.

	    Args:
	        food_df (pd.DataFrame): Food log with 'Timestamp', 'calorie',
	            'total_carb' and 'sugar' columns
	        time_series_df (pd.DataFrame): DataFrame with a 'Timestamp' column to
	            add food metrics to
	        window_size (str, optional): Length of the trailing window, in any
	            form accepted by ``pd.Timedelta``. Defaults to '1h'.

	    Returns:
	        pd.DataFrame: ``time_series_df`` itself, with added 'Calories',
	            'Carbs' and 'Sugar' columns
	    """
	    span = pd.Timedelta(window_size)

	    # Source column in the food log -> running list of per-timestamp totals
	    nutrient_columns = ('calorie', 'total_carb', 'sugar')
	    totals = {name: [] for name in nutrient_columns}

	    for moment in time_series_df['Timestamp']:
	        # Entries logged inside the trailing window (both ends inclusive)
	        logged_at = food_df['Timestamp']
	        meals = food_df[(logged_at >= moment - span) & (logged_at <= moment)]
	        nothing_eaten = meals.empty

	        for name in nutrient_columns:
	            totals[name].append(0.0 if nothing_eaten else meals[name].sum())

	    # Publish the totals under the feature names used downstream
	    time_series_df['Calories'] = totals['calorie']
	    time_series_df['Carbs'] = totals['total_carb']
	    time_series_df['Sugar'] = totals['sugar']

	    return time_series_df

	def calculate_age(born, as_of_date=pd.Timestamp('2019-01-01')):
	    """
	    Calculate age in completed years on a reference date.

	    Args:
	        born (str or timestamp): Date of birth
	        as_of_date (pd.Timestamp, optional): Reference date for age calculation.
	            Defaults to January 1, 2019.

	    Returns:
	        int: Age in years
	    """
	    born = pd.Timestamp(born)
	    as_of_date = pd.Timestamp(as_of_date)

	    age = as_of_date.year - born.year

	    # A plain year difference counts the birthday of the reference year even
	    # when it has not happened yet; take that year back off in that case.
	    if (as_of_date.month, as_of_date.day) < (born.month, born.day):
	        age -= 1

	    return age

	def split_train_test_patients(df, seed=42, patient_ids=None):
	    """
	    Split dataset into training, validation, and test sets based on patient IDs.

	    Two patients are held out for testing and two of the remaining patients
	    for validation; everyone else is used for training. A patient's rows
	    never appear in more than one split.

	    Args:
	        df (pd.DataFrame): Combined dataset with patient_id column
	        seed (int, optional): Random seed for reproducibility. Defaults to 42.
	            The seed is applied to NumPy's global random state.
	        patient_ids (array-like of int, optional): Patient IDs to distribute
	            over the three splits. Defaults to IDs 1 through 15, which
	            reproduces the original fixed split. Rows of any patient not
	            listed here (with the default, e.g. patient 16) end up in none
	            of the returned frames; pass the full ID list to include them.

	    Returns:
	        tuple: (training DataFrame, validation DataFrame, test DataFrame)

	    Raises:
	        ValueError: If fewer than five distinct patient IDs are available,
	            which would leave no patient for training.
	    """
	    n_test = 2
	    n_validation = 2

	    if patient_ids is None:
	        # Legacy default: IDs 1..15 (np.arange excludes the upper bound)
	        patient_ids = np.arange(1, 16)
	    else:
	        patient_ids = np.unique(np.asarray(patient_ids))

	    if len(patient_ids) < n_test + n_validation + 1:
	        raise ValueError(
	            "Need at least %d patient IDs to build train/validation/test splits, got %d"
	            % (n_test + n_validation + 1, len(patient_ids))
	        )

	    np.random.seed(seed)

	    # Draw everyone who is NOT a test patient first; the leftovers are the test set
	    non_test_patients = np.random.choice(
	        patient_ids, size=len(patient_ids) - n_test, replace=False
	    )
	    test_patients = np.setdiff1d(patient_ids, non_test_patients)

	    # Carve the validation patients out of the non-test pool
	    validation_patients = np.random.choice(
	        non_test_patients, size=n_validation, replace=False
	    )
	    training_patients = np.setdiff1d(non_test_patients, validation_patients)

	    df_train = df[df['patient_id'].isin(training_patients)]
	    df_val = df[df['patient_id'].isin(validation_patients)]
	    df_test = df[df['patient_id'].isin(test_patients)]

	    return df_train, df_val, df_test

	def create_features(bg_df, acc_df, food_df, gender, hba1c, add_patient_id=False):
	    """
	    Build one patient's feature table from the raw glucose, accelerometer and food data.

	    The result has one row per EGV glucose reading and carries the reading's
	    timestamp and value, the accelerometer and food features computed for
	    that timestamp, and the constant per-patient columns Gender, HbA1c and Age.
	    A 'Timestamp' column is added in place to each of the three input frames.

	    Args:
	        bg_df (pd.DataFrame): Blood glucose data; must contain a row whose
	            'Event Type' is 'DateOfBirth'
	        acc_df (pd.DataFrame): Accelerometer data
	        food_df (pd.DataFrame): Food log data
	        gender (str): Patient gender; 'FEMALE' is encoded as 1, anything else as 0
	        hba1c (float): Patient HbA1c value
	        add_patient_id (bool, optional): When true, append a 'patient_id'
	            column filled with 0. Defaults to False.

	    Returns:
	        pd.DataFrame: Time series DataFrame with combined features
	    """
	    # Parse each source's own time column into a shared 'Timestamp' column;
	    # values that cannot be parsed become NaT.
	    time_columns = (
	        (bg_df, 'Timestamp (YYYY-MM-DDThh:mm:ss)'),
	        (acc_df, 'datetime'),
	        (food_df, 'time_begin'),
	    )
	    for frame, raw_column in time_columns:
	        frame['Timestamp'] = pd.to_datetime(frame[raw_column], errors='coerce')

	    # Chronological order for the two sensor streams
	    glucose_sorted = bg_df.sort_values(by='Timestamp')
	    acc_sorted = acc_df.sort_values(by='Timestamp')

	    # The date of birth is stored as a special event row in the glucose data
	    renumbered = glucose_sorted.reset_index(drop=True)
	    is_dob_row = renumbered['Event Type'] == 'DateOfBirth'
	    patient_dob = renumbered.loc[is_dob_row, 'Patient Info'].values[0]
	    patient_age = calculate_age(patient_dob)

	    # From here on only genuine glucose readings are needed
	    egv_df = clean_blood_glucose_df(glucose_sorted)

	    # One output row per glucose reading, sharing the readings' index labels
	    features = pd.DataFrame(index=egv_df.index)
	    features['Timestamp'] = egv_df['Timestamp']
	    features['Glucose'] = egv_df['Glucose Value (mg/dL)']

	    features = get_accelerometer_values(acc_sorted, features)
	    features = get_food_values(food_df, features)

	    # Per-patient constants, repeated on every row
	    features['Gender'] = np.where(gender == 'FEMALE', 1, 0)
	    features['HbA1c'] = hba1c
	    features['Age'] = patient_age

	    if add_patient_id:
	        features['patient_id'] = 0

	    return features

	def create_dataframes():
	    """
	    Create individual patient dataframes by processing raw data files.

	    Reads data for patients 1-16, processes it, and saves individual CSV files
	    for each patient in the processed/dataset_by_patient directory. The CSV
	    files are written with the DataFrame index as their first column.

	    Returns:
	        None
	    """
	    data_path = os.path.join(SCRIPT_DIR, "data", "raw", "big_ideas_dataset")
	    output_dir = os.path.join(SCRIPT_DIR, "data", "processed", "dataset_by_patient")

	    # The demographics table is shared by all patients: read it once instead
	    # of once per loop iteration.
	    demographic_data = pd.read_csv(os.path.join(data_path, "Demographics.csv"))

	    # Create directory if it doesn't exist
	    os.makedirs(output_dir, exist_ok=True)

	    for i in range(1, 17):
	        # Raw folders and file names use the zero-padded three-digit ID
	        patient = f"{i:03d}"

	        print("Patient"+str(i))

	        # Load files
	        patient_dir = os.path.join(data_path, patient)
	        bg_df = pd.read_csv(os.path.join(patient_dir, f"Dexcom_{patient}.csv"))
	        acc_df = pd.read_csv(os.path.join(patient_dir, f"ACC_{patient}.csv"))
	        food_df = pd.read_csv(os.path.join(patient_dir, f"Food_Log_{patient}.csv"))

	        # The demographics table is keyed by the plain integer ID; take the
	        # first matching row.
	        patient_demographics = demographic_data[demographic_data['ID'] == i]
	        gender = patient_demographics['Gender'].values[0]
	        hba1c = patient_demographics['HbA1c'].values[0]

	        time_series_df = create_features(bg_df, acc_df, food_df, gender, hba1c)

	        output_path = os.path.join(output_dir, f"patient_{patient}.csv")
	        time_series_df.to_csv(output_path)

	def combine_dataframes():
	    """
	    Combine individual patient dataframes into a single dataset and create
	    train/validation/test splits.

	    Reads the individual patient CSV files for patients 1-16, tags each with
	    its patient_id, combines them, and writes the combined dataset plus the
	    train, validation, and test splits to the processed directory.

	    Returns:
	        None
	    """
	    data_path = os.path.join(SCRIPT_DIR, "data", "processed", "dataset_by_patient")

	    # Collect the per-patient frames and concatenate once at the end. Growing
	    # a DataFrame with concat inside the loop re-copies all earlier rows on
	    # every iteration and starts from an empty frame, which pandas deprecates
	    # as a concat input.
	    patient_frames = []

	    for i in range(1, 17):
	        patient = f"{i:03d}"

	        print(f"Patient {i}")

	        current_df = pd.read_csv(os.path.join(data_path, f"patient_{patient}.csv"))
	        current_df["patient_id"] = i
	        patient_frames.append(current_df)

	    combined_df = pd.concat(patient_frames, ignore_index=True)

	    # Drop the first column: the per-patient CSVs were saved with their index,
	    # which read_csv brings back as an ordinary leading column.
	    combined_df = combined_df.iloc[:, 1:]

	    df_train, df_val, df_test = split_train_test_patients(combined_df)

	    output_path = os.path.join(SCRIPT_DIR, "data", "processed")
	    # Create directory if it doesn't exist
	    os.makedirs(output_path, exist_ok=True)

	    combined_df.to_csv(os.path.join(output_path, "combined_dataset.csv"))
	    df_train.to_csv(os.path.join(output_path, "train_dataset.csv"))
	    df_val.to_csv(os.path.join(output_path, "validation_dataset.csv"))
	    df_test.to_csv(os.path.join(output_path, "test_dataset.csv"))

	def main():
	    """
	    Run the dataset creation pipeline from start to finish.

	    Steps, in order:
	    1. create_dataframes() writes one processed CSV per patient
	    2. combine_dataframes() merges those CSVs into a single dataset and
	       writes the train/validation/test splits

	    Returns:
	        None
	    """
	    print("Running make_dataset script...")

	    # Step 2 reads the files that step 1 writes, so the order matters.
	    create_dataframes()
	    combine_dataframes()


	# Run the pipeline only when the file is executed as a script, not on import.
	if __name__ == '__main__':
	    main()