# File size: 3,248 Bytes
# 17e472a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import numpy as np
import pandas as pd
import streamlit as st
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier

# Function to generate synthetic dataset
def generate_synthetic_data(n_samples: int = 1000, seed: int = 42) -> pd.DataFrame:
    """Return a reproducible synthetic dataset with clinical-style columns.

    Every column is drawn independently at random, so the predictors carry
    no real signal about ``discharge_destination``; the data only exists to
    exercise the modelling pipeline.

    Args:
        n_samples: Number of rows to generate.
        seed: Seed for the random generator. The same seed always yields
            the same frame.

    Returns:
        A DataFrame with the columns ``age``, ``gender``,
        ``mechanical_ventilation``, ``dialysis``, ``gcs``, ``sodium``,
        ``heart_rate``, ``creatinine`` and ``discharge_destination``.
    """
    # A private RandomState keeps the draw reproducible without reseeding
    # NumPy's process-wide generator as a side effect of calling this helper.
    rng = np.random.RandomState(seed)
    data = pd.DataFrame({
        "age": rng.randint(18, 90, n_samples),
        "gender": rng.choice(["Male", "Female"], n_samples),
        "mechanical_ventilation": rng.choice([0, 1], n_samples),
        "dialysis": rng.choice([0, 1], n_samples),
        # randint's upper bound is exclusive: 16 is required so that the
        # maximum Glasgow Coma Scale score of 15 can actually be drawn.
        "gcs": rng.randint(3, 16, n_samples),
        "sodium": rng.uniform(135, 145, n_samples),
        "heart_rate": rng.randint(60, 120, n_samples),
        "creatinine": rng.uniform(0.5, 3.5, n_samples),
        "discharge_destination": rng.choice(
            ["Home", "Nursing Facility", "Rehabilitation", "Death"],
            n_samples,
            p=[0.6, 0.2, 0.1, 0.1],
        ),
    })
    return data

# ---------------------------------------------------------------------------
# Streamlit app body.
#
# Streamlit re-executes this script from top to bottom on every widget
# interaction, so only cheap work (data preview, widget setup, pipeline
# construction) happens unconditionally; resampling and model fitting run
# only after the "Train Model" button is pressed.
# ---------------------------------------------------------------------------

# Title
st.title("Discharge Destination Prediction")

# Generate and display synthetic dataset
data = generate_synthetic_data()
st.write("Synthetic Dataset Preview:")
st.dataframe(data.head())

# Select predictors and target
target_options = ["discharge_destination"]
# The target must never be offered as a predictor: selecting it would leak
# the label straight into the feature matrix.
predictor_options = [col for col in data.columns if col not in target_options]
predictors = st.multiselect("Select predictor features", predictor_options, default=predictor_options)
target = st.selectbox("Select target column", target_options)

if not predictors:
    # Nothing downstream (split, preprocessing, resampling) works without
    # at least one feature, so halt this script run with a clear message.
    st.warning("Select at least one predictor feature to continue.")
    st.stop()

# Split data
X = data[predictors]
# Encode class names as integers 0..k-1 (alphabetical order). XGBoost
# requires integer-encoded classes, and using one encoding for every model
# keeps the evaluation code identical across models.
y_codes, class_index = pd.factorize(data[target], sort=True)
y = pd.Series(y_codes, index=data.index, name=target)
class_labels = np.arange(len(class_index))
class_names = [str(name) for name in class_index]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Preprocessing
# Partition on "numeric or not" so every selected column lands in exactly one
# branch. Matching specific widths such as 'int64' silently drops int32
# columns, because ColumnTransformer discards any column it is not told about.
numerical_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(exclude='number').columns
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), list(numerical_features)),
    ('cat', OneHotEncoder(handle_unknown='ignore'), list(categorical_features))
])

# Select model
model_name = st.selectbox("Select a model", ["Random Forest", "XGBoost"])
if model_name == "Random Forest":
    model = RandomForestClassifier(n_estimators=100, random_state=42)
else:
    model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Build pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# Train and evaluate
if st.button("Train Model"):
    # Oversample the training split only (the test split stays untouched so
    # the reported metrics reflect the original class balance).
    categorical_idx = [X.columns.get_loc(col) for col in categorical_features]
    if not categorical_idx:
        # No categorical predictors selected: plain SMOTE is the right tool.
        sampler = SMOTE(random_state=42)
    elif len(categorical_idx) < X.shape[1]:
        # Mixed numeric/categorical predictors: SMOTENC handles both.
        sampler = SMOTENC(categorical_features=categorical_idx, random_state=42)
    else:
        # SMOTENC needs at least one continuous feature to interpolate on.
        sampler = None

    if sampler is None:
        st.warning("All selected predictors are categorical; training without oversampling.")
        X_train_resampled, y_train_resampled = X_train, y_train
    else:
        X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)

    pipeline.fit(X_train_resampled, y_train_resampled)
    y_pred = pipeline.predict(X_test)

    # Display results
    st.write("Classification Report:")
    st.text(classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=class_names,
        zero_division=0,
    ))

    # ROC AUC is a ranking metric: it must be computed from the predicted
    # class probabilities, not from hard (one-hot) predictions.
    y_proba = pipeline.predict_proba(X_test)
    roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr', labels=class_labels)
    st.write(f"ROC AUC Score: {roc_auc:.4f}")