import pandas as pd
import numpy as np
import json
import os
# Get the directory where the script is located
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
def clean_blood_glucose_df(bg_df):
    """
    Filter a blood glucose dataframe to keep only rows where Event Type is 'EGV' (Estimated Glucose Value).

    Args:
        bg_df (pd.DataFrame): DataFrame containing blood glucose data

    Returns:
        pd.DataFrame: Filtered DataFrame with only EGV events
    """
    # Filter the rows where Event Type is 'EGV'
    bg_df = bg_df[bg_df['Event Type'] == 'EGV']
    return bg_df
def get_accelerometer_values(acc_df, time_series_df, window_size='1h'):
    """
    Calculate accelerometer magnitude values and add them to the time series dataframe.
    Uses a weighted average where more recent values have higher weight.

    Args:
        acc_df (pd.DataFrame): DataFrame containing accelerometer data
        time_series_df (pd.DataFrame): DataFrame to add accelerometer values to
        window_size (str, optional): Time window to consider for calculations. Defaults to '1h'.

    Returns:
        pd.DataFrame: Original DataFrame with added accelerometer magnitude values
    """
    # Calculate magnitude for accelerometer data
    acc_df['Magnitude'] = np.sqrt(acc_df[' acc_x']**2 + acc_df[' acc_y']**2 + acc_df[' acc_z']**2).round(2)
    acc_df['Magnitude'] = pd.to_numeric(acc_df['Magnitude'], errors='coerce')

    weighted_avgs = []
    window_timedelta = pd.Timedelta(window_size)
    for ts in time_series_df['Timestamp']:
        # Select only accelerometer data within the time window
        relevant_acc = acc_df[(acc_df['Timestamp'] >= ts - window_timedelta) & (acc_df['Timestamp'] <= ts)]
        if not relevant_acc.empty:
            # Compute weighted average: more recent values have higher weight
            time_diffs = (ts - relevant_acc['Timestamp']).dt.total_seconds()
            weights = 1 / (time_diffs + 1)  # Avoid division by zero
            weighted_avg = ((relevant_acc['Magnitude'] * weights).sum() / weights.sum()).round(2)
        else:
            weighted_avg = 0
        weighted_avgs.append(weighted_avg)

    time_series_df['Accelerometer'] = weighted_avgs
    return time_series_df
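# Illustrative note on the weighting above (hypothetical values, not from the dataset):
# for readings taken 0 s, 300 s and 600 s before a glucose timestamp with magnitudes
# 1.0, 2.0 and 3.0, the weights are 1/(0+1) = 1.0, 1/301 ~= 0.0033 and 1/601 ~= 0.0017,
# so the weighted average is (1.0*1.0 + 2.0*0.0033 + 3.0*0.0017) / 1.0050 ~= 1.01,
# i.e. the most recent reading dominates almost completely.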
def get_food_values(food_df, time_series_df, window_size='1h'):
    """
    Calculate food metrics (calories, carbs, sugar) for each timestamp in the time series dataframe.

    Args:
        food_df (pd.DataFrame): DataFrame containing food log data
        time_series_df (pd.DataFrame): DataFrame to add food metrics to
        window_size (str, optional): Time window to consider for calculations. Defaults to '1h'.

    Returns:
        pd.DataFrame: Original DataFrame with added food metrics columns
    """
    # Initialize lists for food metrics
    calories = []
    carbs = []
    sugar = []
    window_timedelta = pd.Timedelta(window_size)
    for ts in time_series_df['Timestamp']:
        # Select only food data within the time window
        food_in_window = food_df[(food_df['Timestamp'] >= ts - window_timedelta) &
                                 (food_df['Timestamp'] <= ts)]
        # Calculate cumulative values
        if not food_in_window.empty:
            calories.append(food_in_window['calorie'].sum())
            carbs.append(food_in_window['total_carb'].sum())
            sugar.append(food_in_window['sugar'].sum())
        else:
            calories.append(0.0)
            carbs.append(0.0)
            sugar.append(0.0)

    # Add to time series dataframe
    time_series_df['Calories'] = calories
    time_series_df['Carbs'] = carbs
    time_series_df['Sugar'] = sugar
    return time_series_df
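# Illustrative note (hypothetical values): if two meals were logged 20 and 50 minutes
# before a glucose timestamp with 250 and 100 calories, the 'Calories' feature for that
# timestamp is 350; timestamps with no food logged in the preceding window get 0.0.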
def calculate_age(born, as_of_date=pd.Timestamp('2019-01-01')):
    """
    Calculate age based on date of birth.

    Args:
        born (str or timestamp): Date of birth
        as_of_date (pd.Timestamp, optional): Reference date for age calculation.
            Defaults to January 1, 2019.

    Returns:
        int: Age in years
    """
    born = pd.Timestamp(born)
    # Calculate age
    age = as_of_date.year - born.year
    return age
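# Note: this is a whole-year approximation that ignores the month and day of birth;
# e.g. calculate_age('1995-06-15') returns 24 even though the birthday falls after the
# 2019-01-01 reference date.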
def split_train_test_patients(df, seed=42):
    """
    Split dataset into training, validation, and test sets based on patient IDs.

    Args:
        df (pd.DataFrame): Combined dataset with patient_id column
        seed (int, optional): Random seed for reproducibility. Defaults to 42.

    Returns:
        tuple: (training DataFrame, validation DataFrame, test DataFrame)
    """
    np.random.seed(seed)
    training_patients = np.random.choice(np.arange(1, 16), size=13, replace=False)
    test_patients = np.setdiff1d(np.arange(1, 16), training_patients)
    validation_patients = np.random.choice(training_patients, size=2, replace=False)
    training_patients = np.setdiff1d(training_patients, validation_patients)

    df_train = df[df['patient_id'].isin(training_patients)]
    df_val = df[df['patient_id'].isin(validation_patients)]
    df_test = df[df['patient_id'].isin(test_patients)]
    return df_train, df_val, df_test
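# Note on the split: 13 of the patient IDs 1-15 are drawn for training, the remaining 2
# form the test set, and 2 of the 13 training patients are then held out for validation,
# leaving 11 training patients. Rows whose patient_id falls outside 1-15 are not assigned
# to any split.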
def create_features(bg_df, acc_df, food_df, gender, hba1c, add_patient_id=False):
    """
    Process raw data and create a time series DataFrame with features from multiple sources.

    Args:
        bg_df (pd.DataFrame): Blood glucose data
        acc_df (pd.DataFrame): Accelerometer data
        food_df (pd.DataFrame): Food log data
        gender (str): Patient gender
        hba1c (float): Patient HbA1c value
        add_patient_id (bool, optional): If True, add a placeholder patient_id column. Defaults to False.

    Returns:
        pd.DataFrame: Time series DataFrame with combined features
    """
    # Clean and convert 'Timestamp' columns to datetime format
    bg_df['Timestamp'] = pd.to_datetime(bg_df['Timestamp (YYYY-MM-DDThh:mm:ss)'], errors='coerce')
    acc_df['Timestamp'] = pd.to_datetime(acc_df['datetime'], errors='coerce')
    food_df['Timestamp'] = pd.to_datetime(food_df['time_begin'], errors='coerce')

    # Sort values by date time
    bg_df = bg_df.sort_values(by='Timestamp')
    acc_df = acc_df.sort_values(by='Timestamp')

    # Reset index and then find the row where 'Event Type' is 'DateOfBirth'
    reset_df = bg_df.reset_index(drop=True)
    patient_dob = reset_df[reset_df['Event Type'] == 'DateOfBirth']['Patient Info'].values[0]
    patient_age = calculate_age(patient_dob)

    bg_df = clean_blood_glucose_df(bg_df)

    # Initialize a new DataFrame for the time series
    time_series_df = pd.DataFrame(index=bg_df.index)  # Reuse the index of the cleaned glucose dataframe
    time_series_df[['Timestamp', 'Glucose']] = bg_df[['Timestamp', 'Glucose Value (mg/dL)']]
    # time_series_df = get_acc_hr_values(acc_df, hr_df, time_series_df)
    time_series_df = get_accelerometer_values(acc_df, time_series_df)
    time_series_df = get_food_values(food_df, time_series_df)
    time_series_df['Gender'] = np.where(gender == 'FEMALE', 1, 0)
    time_series_df['HbA1c'] = hba1c
    time_series_df['Age'] = patient_age
    if add_patient_id:
        time_series_df['patient_id'] = 0
    return time_series_df
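# The resulting time series has one row per EGV reading with columns:
# Timestamp, Glucose, Accelerometer, Calories, Carbs, Sugar, Gender, HbA1c, Age
# (plus a placeholder patient_id column when add_patient_id is True).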
def create_dataframes():
    """
    Create individual patient dataframes by processing raw data files.

    Reads data for patients 1-16, processes it, and saves individual CSV files
    for each patient in the processed/dataset_by_patient directory.

    Returns:
        None
    """
    data_path = os.path.join(SCRIPT_DIR, "data", "raw", "big_ideas_dataset")
    for i in range(1, 17):
        patient = f"{i:03d}"
        print(f"Patient {i}")

        # Load files
        bg_df = pd.read_csv(os.path.join(data_path, patient, f"Dexcom_{patient}.csv"))
        acc_df = pd.read_csv(os.path.join(data_path, patient, f"ACC_{patient}.csv"))
        food_df = pd.read_csv(os.path.join(data_path, patient, f"Food_Log_{patient}.csv"))
        demographic_data = pd.read_csv(os.path.join(data_path, "Demographics.csv"))

        patient_demographics = demographic_data[demographic_data['ID'] == i]
        gender = patient_demographics['Gender'].values[0]  # Take the first (and only) matching value
        hba1c = patient_demographics['HbA1c'].values[0]

        time_series_df = create_features(bg_df, acc_df, food_df, gender, hba1c)

        output_dir = os.path.join(SCRIPT_DIR, "data", "processed", "dataset_by_patient")
        # Create directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f"patient_{patient}.csv")
        time_series_df.to_csv(output_path)
    return
def combine_dataframes():
    """
    Combine individual patient dataframes into a single dataset and create
    train/validation/test splits.

    Reads the individual patient CSV files, combines them, and creates
    split datasets based on patient IDs for train, validation, and test sets.

    Returns:
        None
    """
    data_path = os.path.join(SCRIPT_DIR, "data", "processed", "dataset_by_patient")
    combined_df = pd.DataFrame()
    for i in range(1, 17):
        patient = f"{i:03d}"
        print(f"Patient {i}")
        current_df = pd.read_csv(os.path.join(data_path, f"patient_{patient}.csv"))
        current_df["patient_id"] = i
        combined_df = pd.concat([combined_df, current_df], ignore_index=True)

    # Drop the unnamed index column carried over from the per-patient CSV files
    combined_df = combined_df.iloc[:, 1:]
    df_train, df_val, df_test = split_train_test_patients(combined_df)

    output_path = os.path.join(SCRIPT_DIR, "data", "processed")
    # Create directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)
    combined_df.to_csv(os.path.join(output_path, "combined_dataset.csv"))
    df_train.to_csv(os.path.join(output_path, "train_dataset.csv"))
    df_val.to_csv(os.path.join(output_path, "validation_dataset.csv"))
    df_test.to_csv(os.path.join(output_path, "test_dataset.csv"))
    return
def main():
    """
    Main function to run the dataset creation pipeline.

    Executes the full data processing workflow:
    1. Creates individual patient dataframes
    2. Combines them into a single dataset
    3. Creates train/validation/test splits

    Returns:
        None
    """
    print("Running make_dataset script...")
    create_dataframes()
    combine_dataframes()
    return


if __name__ == '__main__':
    main()
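# Expected on-disk layout (inferred from the paths used above): raw CSV exports live in
# data/raw/big_ideas_dataset/<patient>/ (Dexcom_<patient>.csv, ACC_<patient>.csv,
# Food_Log_<patient>.csv) next to data/raw/big_ideas_dataset/Demographics.csv, and the
# per-patient files, combined dataset and splits are written under data/processed/.
# Running `python make_dataset.py` (assuming the file is saved under that name, as the
# print statement in main() suggests) executes the full pipeline.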