# NOTE(review): removed non-Python scrape artifacts ("Spaces:" / "Sleeping" status lines)
# that preceded the code and would be a SyntaxError if left in place.
import pandas as pd | |
import numpy as np | |
import json | |
import os | |
# Get the directory where the script is located | |
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) | |
def clean_blood_glucose_df(bg_df):
    """
    Keep only Estimated Glucose Value rows from a Dexcom export.

    Args:
        bg_df (pd.DataFrame): Raw blood glucose DataFrame with an 'Event Type' column.

    Returns:
        pd.DataFrame: Subset of rows whose 'Event Type' equals 'EGV'.
    """
    egv_mask = bg_df['Event Type'].eq('EGV')
    return bg_df[egv_mask]
def get_accelerometer_values(acc_df, time_series_df, window_size='1h'):
    """
    Add a time-weighted accelerometer magnitude column to the time series.

    For each timestamp in ``time_series_df``, accelerometer samples within the
    trailing ``window_size`` are averaged with weights 1/(age_in_seconds + 1),
    so more recent samples count more. Timestamps with no samples in the
    window get 0.

    Fix: the original implementation wrote a 'Magnitude' column into the
    caller's ``acc_df`` (a hidden side effect and a SettingWithCopy hazard).
    The magnitude is now computed into a local Series; ``acc_df`` is left
    untouched. The returned values are unchanged.

    Args:
        acc_df (pd.DataFrame): Accelerometer data with 'Timestamp' and
            ' acc_x'/' acc_y'/' acc_z' columns (note the leading spaces in
            the axis column names, as in the raw CSVs).
        time_series_df (pd.DataFrame): DataFrame with a 'Timestamp' column;
            gains an 'Accelerometer' column (mutated in place and returned).
        window_size (str, optional): Trailing window to consider. Defaults to '1h'.

    Returns:
        pd.DataFrame: ``time_series_df`` with the added 'Accelerometer' column.
    """
    # Euclidean magnitude of the three axes, computed locally (no mutation of acc_df).
    magnitude = np.sqrt(acc_df[' acc_x'] ** 2 + acc_df[' acc_y'] ** 2 + acc_df[' acc_z'] ** 2).round(2)
    magnitude = pd.to_numeric(magnitude, errors='coerce')
    acc_times = acc_df['Timestamp']
    window_timedelta = pd.Timedelta(window_size)

    weighted_avgs = []
    for ts in time_series_df['Timestamp']:
        # Samples inside the trailing window [ts - window, ts].
        in_window = (acc_times >= ts - window_timedelta) & (acc_times <= ts)
        window_mags = magnitude[in_window]
        if window_mags.empty:
            weighted_avg = 0
        else:
            # Weight by recency: newer samples get larger weights.
            time_diffs = (ts - acc_times[in_window]).dt.total_seconds()
            weights = 1 / (time_diffs + 1)  # +1 avoids division by zero at ts itself
            weighted_avg = ((window_mags * weights).sum() / weights.sum()).round(2)
        weighted_avgs.append(weighted_avg)

    time_series_df['Accelerometer'] = weighted_avgs
    return time_series_df
def get_food_values(food_df, time_series_df, window_size='1h'):
    """
    Add trailing-window food metrics (Calories, Carbs, Sugar) to the time series.

    For each timestamp in ``time_series_df``, sums the food-log entries whose
    timestamp falls in [ts - window_size, ts]; timestamps with no entries get 0.0.

    Args:
        food_df (pd.DataFrame): Food log with 'Timestamp', 'calorie',
            'total_carb' and 'sugar' columns.
        time_series_df (pd.DataFrame): DataFrame with a 'Timestamp' column;
            gains the three metric columns (mutated in place and returned).
        window_size (str, optional): Trailing window to consider. Defaults to '1h'.

    Returns:
        pd.DataFrame: ``time_series_df`` with 'Calories', 'Carbs' and 'Sugar' added.
    """
    window = pd.Timedelta(window_size)
    # Output column -> source column in the food log.
    column_map = {'Calories': 'calorie', 'Carbs': 'total_carb', 'Sugar': 'sugar'}
    totals = {name: [] for name in column_map}

    for ts in time_series_df['Timestamp']:
        in_window = food_df[(food_df['Timestamp'] >= ts - window) &
                            (food_df['Timestamp'] <= ts)]
        if in_window.empty:
            for name in totals:
                totals[name].append(0.0)
        else:
            for name, source in column_map.items():
                totals[name].append(in_window[source].sum())

    for name, values in totals.items():
        time_series_df[name] = values
    return time_series_df
def calculate_age(born, as_of_date=pd.Timestamp('2019-01-01')):
    """
    Calculate age in whole years at a reference date.

    Fix: the original returned ``as_of_date.year - born.year`` only, which
    over-counts by one year for anyone whose birthday falls after the
    reference date within the year. The month/day correction below gives the
    conventional "completed years" age.

    Args:
        born (str or timestamp): Date of birth (anything ``pd.Timestamp`` accepts).
        as_of_date (pd.Timestamp, optional): Reference date for the age
            calculation. Defaults to January 1, 2019.

    Returns:
        int: Age in completed years as of ``as_of_date``.
    """
    born = pd.Timestamp(born)
    age = as_of_date.year - born.year
    # Subtract one year if the birthday has not yet occurred by the reference date.
    if (as_of_date.month, as_of_date.day) < (born.month, born.day):
        age -= 1
    return age
def split_train_test_patients(df, seed=42):
    """
    Split the combined dataset into train/validation/test sets by patient ID.

    Patient IDs 1..15 are partitioned into 11 training, 2 validation and
    2 test patients using the seeded NumPy RNG, so the split is reproducible.

    NOTE(review): only IDs 1..15 are drawn here, while the rest of the file
    processes patients 1..16 — rows with patient_id 16 end up in no split.
    Confirm whether that exclusion is intentional.

    Args:
        df (pd.DataFrame): Combined dataset with a 'patient_id' column.
        seed (int, optional): Random seed for reproducibility. Defaults to 42.

    Returns:
        tuple: (training DataFrame, validation DataFrame, test DataFrame)
    """
    np.random.seed(seed)
    candidate_ids = np.arange(1, 16)
    # Same RNG call sequence as always: 13 training draws, then 2 validation draws.
    train_ids = np.random.choice(candidate_ids, size=13, replace=False)
    test_ids = np.setdiff1d(candidate_ids, train_ids)
    val_ids = np.random.choice(train_ids, size=2, replace=False)
    train_ids = np.setdiff1d(train_ids, val_ids)

    return (df[df['patient_id'].isin(train_ids)],
            df[df['patient_id'].isin(val_ids)],
            df[df['patient_id'].isin(test_ids)])
def create_features(bg_df, acc_df, food_df, gender, hba1c, add_patient_id=False):
    """
    Build a per-patient time series DataFrame from the raw data sources.

    Parses timestamps, extracts the patient's age from the Dexcom metadata
    rows, filters glucose readings to EGV events, and attaches accelerometer,
    food and demographic features.

    Args:
        bg_df (pd.DataFrame): Blood glucose data (Dexcom export). Gains a
            parsed 'Timestamp' column in place.
        acc_df (pd.DataFrame): Accelerometer data. Gains a parsed 'Timestamp'
            column in place.
        food_df (pd.DataFrame): Food log data. Gains a parsed 'Timestamp'
            column in place.
        gender (str): Patient gender; encoded 1 for 'FEMALE', else 0.
        hba1c (float): Patient HbA1c value (constant column).
        add_patient_id (bool, optional): If True, add a placeholder
            'patient_id' column set to 0. Defaults to False.

    Returns:
        pd.DataFrame: Time series DataFrame with the combined features.
    """
    # Parse each source's native timestamp column into a shared 'Timestamp' column.
    timestamp_sources = ((bg_df, 'Timestamp (YYYY-MM-DDThh:mm:ss)'),
                         (acc_df, 'datetime'),
                         (food_df, 'time_begin'))
    for frame, source_col in timestamp_sources:
        frame['Timestamp'] = pd.to_datetime(frame[source_col], errors='coerce')

    bg_df = bg_df.sort_values(by='Timestamp')
    acc_df = acc_df.sort_values(by='Timestamp')

    # Date of birth lives in a metadata row of the Dexcom export.
    metadata = bg_df.reset_index(drop=True)
    patient_dob = metadata[metadata['Event Type'] == 'DateOfBirth']['Patient Info'].values[0]
    patient_age = calculate_age(patient_dob)

    # Keep only real glucose readings; their index anchors the time series.
    bg_df = clean_blood_glucose_df(bg_df)
    time_series_df = pd.DataFrame(index=bg_df.index)
    time_series_df[['Timestamp', 'Glucose']] = bg_df[['Timestamp', 'Glucose Value (mg/dL)']]

    time_series_df = get_accelerometer_values(acc_df, time_series_df)
    time_series_df = get_food_values(food_df, time_series_df)

    # Static per-patient features, broadcast to every row.
    time_series_df['Gender'] = np.where(gender == 'FEMALE', 1, 0)
    time_series_df['HbA1c'] = hba1c
    time_series_df['Age'] = patient_age
    if add_patient_id:
        time_series_df['patient_id'] = 0
    return time_series_df
def create_dataframes():
    """
    Create individual patient dataframes by processing raw data files.

    Reads data for patients 1-16 from data/raw/big_ideas_dataset, builds each
    patient's feature time series via ``create_features``, and saves one CSV
    per patient under data/processed/dataset_by_patient.

    Fix: ``Demographics.csv`` was re-read and the output directory re-created
    on every loop iteration; both are loop-invariant and are now done once.

    Returns:
        None
    """
    data_path = os.path.join(SCRIPT_DIR, "data", "raw", "big_ideas_dataset")
    output_dir = os.path.join(SCRIPT_DIR, "data", "processed", "dataset_by_patient")
    # Create the output directory once, up front.
    os.makedirs(output_dir, exist_ok=True)
    # Demographics are shared across patients: load once, index per patient below.
    demographic_data = pd.read_csv(os.path.join(data_path, "Demographics.csv"))

    for i in range(1, 17):
        patient = f"{i:03d}"
        print("Patient" + str(i))
        # Load this patient's raw files.
        bg_df = pd.read_csv(os.path.join(data_path, patient, f"Dexcom_{patient}.csv"))
        acc_df = pd.read_csv(os.path.join(data_path, patient, f"ACC_{patient}.csv"))
        food_df = pd.read_csv(os.path.join(data_path, patient, f"Food_Log_{patient}.csv"))

        patient_demographics = demographic_data[demographic_data['ID'] == i]
        gender = patient_demographics['Gender'].values[0]  # first (only) row for this ID
        hba1c = patient_demographics['HbA1c'].values[0]

        time_series_df = create_features(bg_df, acc_df, food_df, gender, hba1c)
        output_path = os.path.join(output_dir, f"patient_{patient}.csv")
        time_series_df.to_csv(output_path)
    return
def combine_dataframes():
    """
    Combine individual patient dataframes into a single dataset and create
    train/validation/test splits.

    Reads the per-patient CSVs written by ``create_dataframes``, tags each
    with its patient_id, concatenates them, drops the CSV row-index column,
    and writes the combined plus train/validation/test CSVs to data/processed.

    Fix: the original called ``pd.concat`` inside the loop (quadratic
    copying); frames are now collected in a list and concatenated once.

    Returns:
        None
    """
    data_path = os.path.join(SCRIPT_DIR, "data", "processed", "dataset_by_patient")
    frames = []
    for i in range(1, 17):
        patient = f"{i:03d}"
        print(f"Patient {i}")
        current_df = pd.read_csv(os.path.join(data_path, f"patient_{patient}.csv"))
        current_df["patient_id"] = i
        frames.append(current_df)
    # Single concat avoids repeated reallocation of the growing frame.
    combined_df = pd.concat(frames, ignore_index=True)
    # Drop the first column: the unnamed row index written by to_csv in create_dataframes.
    combined_df = combined_df.iloc[:, 1:]

    df_train, df_val, df_test = split_train_test_patients(combined_df)

    output_path = os.path.join(SCRIPT_DIR, "data", "processed")
    # Create directory if it doesn't exist.
    os.makedirs(output_path, exist_ok=True)
    combined_df.to_csv(os.path.join(output_path, "combined_dataset.csv"))
    df_train.to_csv(os.path.join(output_path, "train_dataset.csv"))
    df_val.to_csv(os.path.join(output_path, "validation_dataset.csv"))
    df_test.to_csv(os.path.join(output_path, "test_dataset.csv"))
    return
def main():
    """
    Run the dataset creation pipeline end to end.

    Steps:
        1. Build one processed CSV per patient (``create_dataframes``).
        2. Combine them and write train/validation/test splits
           (``combine_dataframes``).

    Returns:
        None
    """
    print("Running make_dataset script...")
    create_dataframes()
    combine_dataframes()
    return
# Script entry point: run the full pipeline only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()