# BloodGlucosePrediction / scripts / make_dataset.py
# iaravagni
# Initial commit
# 6e8eb41
import pandas as pd
import numpy as np
import json
import os
# Absolute path of the directory containing this script; create_dataframes and
# combine_dataframes build their "data/..." input and output paths from it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
def clean_blood_glucose_df(bg_df):
    """
    Keep only the Estimated Glucose Value (EGV) rows of a blood glucose log.

    Args:
        bg_df (pd.DataFrame): Blood glucose data with an 'Event Type' column

    Returns:
        pd.DataFrame: The rows of bg_df whose 'Event Type' equals 'EGV',
            with their original index labels
    """
    # Boolean mask marking the EGV readings; every other event type is dropped
    is_egv = bg_df['Event Type'].eq('EGV')
    return bg_df.loc[is_egv]
def get_accelerometer_values(acc_df, time_series_df, window_size='1h'):
    """
    Add a recency-weighted accelerometer magnitude to each time series row.

    For every timestamp in time_series_df, the accelerometer samples recorded
    within the preceding window (both ends inclusive) are averaged with
    weights 1 / (age_in_seconds + 1), so more recent samples count more.
    Samples whose magnitude is missing (NaN) are ignored. Rows with no usable
    sample in their window get 0.0.

    Args:
        acc_df (pd.DataFrame): Accelerometer data with 'Timestamp', ' acc_x',
            ' acc_y' and ' acc_z' columns (the leading space is part of the
            axis column names). A 'Magnitude' column is added in place.
        time_series_df (pd.DataFrame): DataFrame with a 'Timestamp' column;
            an 'Accelerometer' column is added to it in place.
        window_size (str, optional): Time window to consider for calculations,
            in any format accepted by pd.Timedelta. Defaults to '1h'.

    Returns:
        pd.DataFrame: time_series_df with the added 'Accelerometer' column
    """
    # Euclidean norm of the three axes, rounded to two decimals
    acc_df['Magnitude'] = np.sqrt(
        acc_df[' acc_x']**2 + acc_df[' acc_y']**2 + acc_df[' acc_z']**2
    ).round(2)
    acc_df['Magnitude'] = pd.to_numeric(acc_df['Magnitude'], errors='coerce')

    # Samples without a magnitude must not take part in the average: their
    # weight would otherwise still count in the denominator and drag the
    # result towards zero.
    usable_acc = acc_df.loc[acc_df['Magnitude'].notna(), ['Timestamp', 'Magnitude']]

    window_timedelta = pd.Timedelta(window_size)
    weighted_avgs = []
    for ts in time_series_df['Timestamp']:
        # Select only accelerometer data within [ts - window, ts]
        in_window = (
            (usable_acc['Timestamp'] >= ts - window_timedelta)
            & (usable_acc['Timestamp'] <= ts)
        )
        relevant_acc = usable_acc[in_window]
        if relevant_acc.empty:
            # Float fallback keeps the column dtype consistent
            weighted_avgs.append(0.0)
            continue

        # Weighted average: more recent values have higher weight
        time_diffs = (ts - relevant_acc['Timestamp']).dt.total_seconds()
        weights = 1 / (time_diffs + 1)  # +1 avoids division by zero at ts itself
        weighted_avg = (relevant_acc['Magnitude'] * weights).sum() / weights.sum()
        weighted_avgs.append(weighted_avg.round(2))

    time_series_df['Accelerometer'] = weighted_avgs
    return time_series_df
def get_food_values(food_df, time_series_df, window_size='1h'):
    """
    Attach trailing-window food totals to every row of the time series.

    For each timestamp, the food log entries whose 'Timestamp' lies within the
    preceding window (both ends inclusive) are summed per nutrient. Rows with
    no entry in their window receive 0.0 for all three nutrients.

    Args:
        food_df (pd.DataFrame): Food log with 'Timestamp', 'calorie',
            'total_carb' and 'sugar' columns
        time_series_df (pd.DataFrame): DataFrame with a 'Timestamp' column;
            'Calories', 'Carbs' and 'Sugar' columns are added to it in place
        window_size (str, optional): Length of the look-back window.
            Defaults to '1h'.

    Returns:
        pd.DataFrame: time_series_df with the three added nutrient columns
    """
    span = pd.Timedelta(window_size)
    nutrient_columns = ('calorie', 'total_carb', 'sugar')
    totals = {name: [] for name in nutrient_columns}

    for moment in time_series_df['Timestamp']:
        # Entries logged inside [moment - span, moment]
        eaten = food_df[(food_df['Timestamp'] >= moment - span) &
                        (food_df['Timestamp'] <= moment)]
        nothing_eaten = eaten.empty
        for name in nutrient_columns:
            totals[name].append(0.0 if nothing_eaten else eaten[name].sum())

    # Publish the per-row totals under the output column names
    time_series_df['Calories'] = totals['calorie']
    time_series_df['Carbs'] = totals['total_carb']
    time_series_df['Sugar'] = totals['sugar']
    return time_series_df
def calculate_age(born, as_of_date=pd.Timestamp('2019-01-01')):
    """
    Calculate age in completed years based on date of birth.

    Args:
        born (str or timestamp): Date of birth, in any form accepted by
            pd.Timestamp
        as_of_date (pd.Timestamp, optional): Reference date for age calculation.
            Defaults to January 1, 2019.

    Returns:
        int: Age in completed years on as_of_date
    """
    born = pd.Timestamp(born)

    age = as_of_date.year - born.year
    # A plain year difference overstates the age by one when the birthday has
    # not yet occurred in the reference year.
    if (as_of_date.month, as_of_date.day) < (born.month, born.day):
        age -= 1
    return age
def split_train_test_patients(df, seed=42, patient_ids=None, n_test=2, n_val=2):
    """
    Split dataset into training, validation, and test sets based on patient IDs.

    Patients (not rows) are assigned to exactly one split, so no patient's
    data appears in more than one set. Rows whose patient_id is not listed in
    patient_ids end up in none of the returned sets.

    Args:
        df (pd.DataFrame): Combined dataset with patient_id column
        seed (int, optional): Random seed for reproducibility. Defaults to 42.
        patient_ids (array-like, optional): Patient IDs to distribute over the
            splits. Defaults to 1..15, which reproduces the historical split.
        n_test (int, optional): Number of test patients. Defaults to 2.
        n_val (int, optional): Number of validation patients. Defaults to 2.

    Returns:
        tuple: (training DataFrame, validation DataFrame, test DataFrame)

    Raises:
        ValueError: If the requested split sizes are negative or exceed the
            number of available patients.
    """
    if patient_ids is None:
        # NOTE(review): combine_dataframes loads patients 1..16, so with this
        # default patient 16 lands in no split. Kept as-is so existing splits
        # stay reproducible; pass patient_ids explicitly to include it.
        patient_ids = np.arange(1, 16)
    patient_ids = np.asarray(patient_ids)

    if n_test < 0 or n_val < 0 or n_test + n_val > len(patient_ids):
        raise ValueError(
            f"Cannot draw {n_test} test and {n_val} validation patients "
            f"from {len(patient_ids)} patient IDs"
        )

    # Seeds NumPy's global RNG, exactly as before, so the draws (and any
    # later use of np.random by the caller) are unchanged.
    np.random.seed(seed)

    # First draw everything that is not test, then carve validation out of it
    n_train_val = len(patient_ids) - n_test
    training_patients = np.random.choice(patient_ids, size=n_train_val, replace=False)
    test_patients = np.setdiff1d(patient_ids, training_patients)
    validation_patients = np.random.choice(training_patients, size=n_val, replace=False)
    training_patients = np.setdiff1d(training_patients, validation_patients)

    df_train = df[df['patient_id'].isin(training_patients)]
    df_val = df[df['patient_id'].isin(validation_patients)]
    df_test = df[df['patient_id'].isin(test_patients)]
    return df_train, df_val, df_test
def create_features(bg_df, acc_df, food_df, gender, hba1c, add_patient_id = False):
    """
    Build one patient's feature table, with one row per EGV glucose reading.

    Each input frame gets a parsed 'Timestamp' column added in place
    (unparseable values become NaT). The result carries the glucose value, the
    windowed accelerometer and food features, and the patient-level constants.

    Args:
        bg_df (pd.DataFrame): Blood glucose data; must contain a 'DateOfBirth'
            event row whose 'Patient Info' holds the date of birth
        acc_df (pd.DataFrame): Accelerometer data with a 'datetime' column
        food_df (pd.DataFrame): Food log data with a 'time_begin' column
        gender (str): Patient gender; 'FEMALE' is encoded as 1, anything else as 0
        hba1c (float): Patient HbA1c value
        add_patient_id (bool, optional): When True, add a 'patient_id' column
            filled with 0. Defaults to False.

    Returns:
        pd.DataFrame: Time series DataFrame with combined features
    """
    # Parse every source's raw time column into a common 'Timestamp' column
    raw_time_columns = (
        (bg_df, 'Timestamp (YYYY-MM-DDThh:mm:ss)'),
        (acc_df, 'datetime'),
        (food_df, 'time_begin'),
    )
    for frame, raw_column in raw_time_columns:
        frame['Timestamp'] = pd.to_datetime(frame[raw_column], errors='coerce')

    # Chronological order for the glucose and accelerometer streams
    bg_df = bg_df.sort_values(by='Timestamp')
    acc_df = acc_df.sort_values(by='Timestamp')

    # The date of birth is stored as a dedicated event row of the glucose export
    renumbered = bg_df.reset_index(drop=True)
    dob_rows = renumbered[renumbered['Event Type'] == 'DateOfBirth']
    patient_age = calculate_age(dob_rows['Patient Info'].values[0])

    egv_df = clean_blood_glucose_df(bg_df)

    # Seed the output with the glucose readings, keeping their index labels
    time_series_df = pd.DataFrame(index=egv_df.index)
    time_series_df[['Timestamp','Glucose']] = egv_df[['Timestamp','Glucose Value (mg/dL)']]

    # Windowed sensor and food features
    time_series_df = get_accelerometer_values(acc_df, time_series_df)
    time_series_df = get_food_values(food_df, time_series_df)

    # Patient-level constants, repeated on every row
    time_series_df['Gender'] = np.where(gender == 'FEMALE', 1, 0)
    time_series_df['HbA1c'] = hba1c
    time_series_df['Age'] = patient_age

    if add_patient_id:
        time_series_df['patient_id'] = 0

    return time_series_df
def create_dataframes():
    """
    Create individual patient dataframes by processing raw data files.

    Reads the Dexcom, accelerometer and food log files for patients 1-16 from
    data/raw/big_ideas_dataset (relative to this script), builds each
    patient's feature table with create_features, and saves it as
    patient_XXX.csv in the data/processed/dataset_by_patient directory.

    Returns:
        None
    """
    data_path = os.path.join(SCRIPT_DIR, "data", "raw", "big_ideas_dataset")
    output_dir = os.path.join(SCRIPT_DIR, "data", "processed", "dataset_by_patient")

    # The demographics table and the output directory are the same for every
    # patient, so load/create them once instead of on each iteration.
    demographic_data = pd.read_csv(os.path.join(data_path, "Demographics.csv"))
    os.makedirs(output_dir, exist_ok=True)

    for i in range(1, 17):
        patient = f"{i:03d}"  # folders and files use zero-padded IDs, e.g. "007"
        print("Patient"+str(i))

        # Load files
        bg_df = pd.read_csv(os.path.join(data_path, patient, f"Dexcom_{patient}.csv"))
        acc_df = pd.read_csv(os.path.join(data_path, patient, f"ACC_{patient}.csv"))
        food_df = pd.read_csv(os.path.join(data_path, patient, f"Food_Log_{patient}.csv"))

        # Demographics are keyed by the numeric ID; take the first matching row
        patient_demographics = demographic_data[demographic_data['ID'] == i]
        gender = patient_demographics['Gender'].values[0]
        hba1c = patient_demographics['HbA1c'].values[0]

        time_series_df = create_features(bg_df, acc_df, food_df, gender, hba1c)

        output_path = os.path.join(output_dir, f"patient_{patient}.csv")
        time_series_df.to_csv(output_path)
    return
def combine_dataframes():
    """
    Combine individual patient dataframes into a single dataset and create
    train/validation/test splits.

    Reads the per-patient CSV files for patients 1-16 from
    data/processed/dataset_by_patient, tags each row with its patient_id,
    concatenates them, and writes combined_dataset.csv, train_dataset.csv,
    validation_dataset.csv and test_dataset.csv to data/processed.

    Returns:
        None
    """
    data_path = os.path.join(SCRIPT_DIR, "data", "processed", "dataset_by_patient")

    # Collect the frames and concatenate once: growing a DataFrame inside the
    # loop re-copies all earlier rows each time and starts from an empty
    # frame, which pandas has deprecated for dtype inference in concat.
    patient_frames = []
    for i in range(1, 17):
        patient = f"{i:03d}"
        print(f"Patient {i}")
        current_df = pd.read_csv(os.path.join(data_path, f"patient_{patient}.csv"))
        current_df["patient_id"] = i
        patient_frames.append(current_df)
    combined_df = pd.concat(patient_frames, ignore_index=True)

    # Drop the first column, i.e. the index that to_csv stored in each
    # per-patient file.
    combined_df = combined_df.iloc[:, 1:]

    df_train, df_val, df_test = split_train_test_patients(combined_df)

    output_path = os.path.join(SCRIPT_DIR, "data", "processed")
    # Create directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)
    combined_df.to_csv(os.path.join(output_path, "combined_dataset.csv"))
    df_train.to_csv(os.path.join(output_path, "train_dataset.csv"))
    df_val.to_csv(os.path.join(output_path, "validation_dataset.csv"))
    df_test.to_csv(os.path.join(output_path, "test_dataset.csv"))
    return
def main():
    """
    Entry point of the dataset creation pipeline.

    Announces the run on stdout, then performs the two pipeline stages in
    order:
        1. create_dataframes: build and save one CSV per patient
        2. combine_dataframes: merge those CSVs and write the combined
           dataset together with its train/validation/test splits

    Returns:
        None
    """
    print("Running make_dataset script...")

    # Stage 1: per-patient feature tables
    create_dataframes()

    # Stage 2: combined dataset and splits (reads the files written by stage 1)
    combine_dataframes()
# Run the pipeline only when this file is executed as a script, not on import
if __name__ == '__main__':
    main()