Spaces:

iaravagni
/

BloodGlucosePrediction

Sleeping

File size: 10,443 Bytes

6e8eb41

import pandas as pd
import numpy as np
import json
import os

# Get the directory where the script is located
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

def clean_blood_glucose_df(bg_df):
    """
    Filter a blood glucose dataframe to keep only rows where Event Type is 'EGV' (Estimated Glucose Value).
    
    Args:
        bg_df (pd.DataFrame): DataFrame containing blood glucose data
        
    Returns:
        pd.DataFrame: Filtered DataFrame with only EGV events
    """
    # Filter the rows where Event Type is 'EGV'
    bg_df = bg_df[bg_df['Event Type'] == 'EGV']
    return bg_df

def get_accelerometer_values(acc_df, time_series_df, window_size='1h'):
    """
    Calculate accelerometer magnitude values and add them to the time series dataframe.
    Uses a weighted average where more recent values have higher weight.
    
    Args:
        acc_df (pd.DataFrame): DataFrame containing accelerometer data
        time_series_df (pd.DataFrame): DataFrame to add accelerometer values to
        window_size (str, optional): Time window to consider for calculations. Defaults to '1h'.
        
    Returns:
        pd.DataFrame: Original DataFrame with added accelerometer magnitude values
    """
    # Calculate magnitude for accelerometer data
    acc_df['Magnitude'] = np.sqrt(acc_df[' acc_x']**2 + acc_df[' acc_y']**2 + acc_df[' acc_z']**2).round(2)
    acc_df['Magnitude'] = pd.to_numeric(acc_df['Magnitude'], errors='coerce')

    weighted_avgs = []
    window_timedelta = pd.Timedelta(window_size)
    
    for ts in time_series_df['Timestamp']:
        # Select only accelerometer data within the time window
        relevant_acc = acc_df[(acc_df['Timestamp'] >= ts - window_timedelta) & (acc_df['Timestamp'] <= ts)]
        
        if not relevant_acc.empty:
            # Compute weighted average: more recent values have higher weight
            time_diffs = (ts - relevant_acc['Timestamp']).dt.total_seconds()
            weights = 1 / (time_diffs + 1)  # Avoid division by zero
            weighted_avg = ((relevant_acc['Magnitude'] * weights).sum() / weights.sum()).round(2)
        else:
            weighted_avg = 0
        
        weighted_avgs.append(weighted_avg)

    
    time_series_df['Accelerometer'] = weighted_avgs
    
    return time_series_df

def get_food_values(food_df, time_series_df, window_size='1h'):
    """
    Calculate food metrics (calories, carbs, sugar) for each timestamp in the time series dataframe.
    
    Args:
        food_df (pd.DataFrame): DataFrame containing food log data
        time_series_df (pd.DataFrame): DataFrame to add food metrics to
        window_size (str, optional): Time window to consider for calculations. Defaults to '1h'.
        
    Returns:
        pd.DataFrame: Original DataFrame with added food metrics columns
    """
    # Initialize arrays for food metrics
    calories = []
    carbs = []
    sugar = []
    
    window_timedelta = pd.Timedelta(window_size)
    
    for ts in time_series_df['Timestamp']:
        # Select only food data within the time window
        food_in_window = food_df[(food_df['Timestamp'] >= ts - window_timedelta) & 
                                 (food_df['Timestamp'] <= ts)]
        
        # Calculate cumulative values
        if not food_in_window.empty:
            calories.append(food_in_window['calorie'].sum())
            carbs.append(food_in_window['total_carb'].sum())
            sugar.append(food_in_window['sugar'].sum())
        else:
            calories.append(0.0)
            carbs.append(0.0)
            sugar.append(0.0)
    
    # Add to time series dataframe
    time_series_df['Calories'] = calories
    time_series_df['Carbs'] = carbs
    time_series_df['Sugar'] = sugar
    
    return time_series_df

def calculate_age(born, as_of_date=pd.Timestamp('2019-01-01')):
    """
    Calculate age based on date of birth.
    
    Args:
        born (str or timestamp): Date of birth
        as_of_date (pd.Timestamp, optional): Reference date for age calculation. 
                                             Defaults to January 1, 2019.
        
    Returns:
        int: Age in years
    """
    born = pd.Timestamp(born)

    # Calculate age
    age = as_of_date.year - born.year
    
    return age

def split_train_test_patients(df, seed=42):
    """
    Split dataset into training, validation, and test sets based on patient IDs.
    
    Args:
        df (pd.DataFrame): Combined dataset with patient_id column
        seed (int, optional): Random seed for reproducibility. Defaults to 42.
        
    Returns:
        tuple: (training DataFrame, validation DataFrame, test DataFrame)
    """
    np.random.seed(seed)
    training_patients = np.random.choice(np.arange(1, 16), size=13, replace=False)

    test_patients = np.setdiff1d(np.arange(1, 16), training_patients)

    validation_patients = np.random.choice(training_patients, size=2, replace=False)

    training_patients = np.setdiff1d(training_patients, validation_patients)

    df_train = df[df['patient_id'].isin(training_patients)]
    df_val = df[df['patient_id'].isin(validation_patients)]
    df_test = df[df['patient_id'].isin(test_patients)]

    return df_train, df_val, df_test

def create_features(bg_df, acc_df, food_df, gender, hba1c, add_patient_id = False):
    """
    Process raw data and create a time series DataFrame with features from multiple sources.
    
    Args:
        bg_df (pd.DataFrame): Blood glucose data
        acc_df (pd.DataFrame): Accelerometer data
        food_df (pd.DataFrame): Food log data
        gender (str): Patient gender
        hba1c (float): Patient HbA1c value
        
    Returns:
        pd.DataFrame: Time series DataFrame with combined features
    """
    # Clean and convert 'Timestamp' columns to datetime format
    bg_df['Timestamp'] = pd.to_datetime(bg_df['Timestamp (YYYY-MM-DDThh:mm:ss)'], errors='coerce')
    acc_df['Timestamp'] = pd.to_datetime(acc_df['datetime'], errors='coerce')
    food_df['Timestamp'] = pd.to_datetime(food_df['time_begin'], errors='coerce')
    
    # Sort values by date time
    bg_df = bg_df.sort_values(by='Timestamp')
    acc_df = acc_df.sort_values(by='Timestamp')

    # Reset index and then find the row where 'Event Type' is 'DateOfBirth'
    reset_df = bg_df.reset_index(drop=True)
    patient_dob = reset_df[reset_df['Event Type'] == 'DateOfBirth']['Patient Info'].values[0]

    patient_age = calculate_age(patient_dob)

    bg_df = clean_blood_glucose_df(bg_df)
    
    # Initialize a new DataFrame for the time series
    time_series_df = pd.DataFrame(index=bg_df.index)  # Use the glucose timestamps as the index

    time_series_df[['Timestamp','Glucose']] = bg_df[['Timestamp','Glucose Value (mg/dL)']]

    # time_series_df = get_acc_hr_values(acc_df, hr_df, time_series_df) 
    time_series_df = get_accelerometer_values(acc_df, time_series_df)
    time_series_df = get_food_values(food_df, time_series_df)

    time_series_df['Gender'] = np.where(gender == 'FEMALE', 1, 0)
    time_series_df['HbA1c'] = hba1c
    time_series_df['Age'] = patient_age

    if add_patient_id:
        time_series_df['patient_id'] = 0
    
    return time_series_df

def create_dataframes():
    """
    Create individual patient dataframes by processing raw data files.
    
    Reads data for patients 1-16, processes it, and saves individual CSV files
    for each patient in the processed/dataset_by_patient directory.
    
    Returns:
        None
    """
    data_path = os.path.join(SCRIPT_DIR, "data", "raw", "big_ideas_dataset")

    for i in range(1, 17):
        patient = f"{i:03d}"

        print("Patient"+str(i))
                
        # Load files
        bg_df = pd.read_csv(os.path.join(data_path, patient, f"Dexcom_{patient}.csv"))
        acc_df = pd.read_csv(os.path.join(data_path, patient, f"ACC_{patient}.csv"))
        food_df = pd.read_csv(os.path.join(data_path, patient, f"Food_Log_{patient}.csv"))
        demographic_data = pd.read_csv(os.path.join(data_path, "Demographics.csv"))
        
        patient_demographics = demographic_data[demographic_data['ID'] == i]

        gender = patient_demographics['Gender'].values[0]  # Assuming you want the first value

        hba1c = patient_demographics['HbA1c'].values[0]

        time_series_df = create_features(bg_df, acc_df, food_df, gender, hba1c)

        output_dir = os.path.join(SCRIPT_DIR, "data", "processed", "dataset_by_patient")
        # Create directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        output_path = os.path.join(output_dir, f"patient_{patient}.csv")
        time_series_df.to_csv(output_path)

    return

def combine_dataframes():
    """
    Combine individual patient dataframes into a single dataset and create
    train/validation/test splits.
    
    Reads the individual patient CSV files, combines them, and creates
    split datasets based on patient IDs for train, validation, and test sets.
    
    Returns:
        None
    """
    data_path = os.path.join(SCRIPT_DIR, "data", "processed", "dataset_by_patient")
    combined_df = pd.DataFrame()

    for i in range(1, 17):
        patient = f"{i:03d}"

        print(f"Patient {i}")

        current_df = pd.read_csv(os.path.join(data_path, f"patient_{patient}.csv"))

        current_df["patient_id"] = i

        combined_df = pd.concat([combined_df, current_df], ignore_index=True)

    combined_df = combined_df.iloc[:, 1:]

    df_train, df_val, df_test = split_train_test_patients(combined_df)

    output_path = os.path.join(SCRIPT_DIR, "data", "processed")
    # Create directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)
    
    combined_df.to_csv(os.path.join(output_path, "combined_dataset.csv"))
    df_train.to_csv(os.path.join(output_path, "train_dataset.csv"))
    df_val.to_csv(os.path.join(output_path, "validation_dataset.csv"))
    df_test.to_csv(os.path.join(output_path, "test_dataset.csv"))

    return

def main():
    """
    Main function to run the dataset creation pipeline.
    
    Executes the full data processing workflow:
    1. Creates individual patient dataframes
    2. Combines them into a single dataset
    3. Creates train/validation/test splits
    
    Returns:
        None
    """
    print("Running make_dataset script...")
    create_dataframes()
    combine_dataframes()

    return

if __name__ == '__main__':
    main()