# Streamlit demo app: train a classifier on a synthetic patient dataset and
# report how well it predicts the discharge destination.

import numpy as np
import pandas as pd
import streamlit as st
from imblearn.over_sampling import SMOTE, SMOTEN, SMOTENC
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

# Name of the label column; it must never be offered as a predictor.
TARGET_COLUMN = "discharge_destination"


# Function to generate synthetic dataset
def generate_synthetic_data():
    """Return a reproducible 1000-row synthetic patient DataFrame.

    Seeds NumPy's global random state with 42 so every call (and every
    Streamlit rerun) produces the same rows. The frame holds eight feature
    columns plus the ``discharge_destination`` label, which is drawn with
    probabilities 0.6 / 0.2 / 0.1 / 0.1 and is therefore imbalanced.
    """
    np.random.seed(42)
    n_samples = 1000
    data = pd.DataFrame({
        "age": np.random.randint(18, 90, n_samples),
        "gender": np.random.choice(["Male", "Female"], n_samples),
        "mechanical_ventilation": np.random.choice([0, 1], n_samples),
        "dialysis": np.random.choice([0, 1], n_samples),
        # NOTE(review): randint's upper bound is exclusive, so 15 is never
        # generated. If this column is meant to span 3-15 inclusive the
        # bound should be 16 -- confirm before changing the data.
        "gcs": np.random.randint(3, 15, n_samples),
        "sodium": np.random.uniform(135, 145, n_samples),
        "heart_rate": np.random.randint(60, 120, n_samples),
        "creatinine": np.random.uniform(0.5, 3.5, n_samples),
        "discharge_destination": np.random.choice(
            ["Home", "Nursing Facility", "Rehabilitation", "Death"],
            n_samples,
            p=[0.6, 0.2, 0.1, 0.1],
        ),
    })
    return data


def _make_sampler(features, categorical_columns):
    """Return the SMOTE variant that matches the dtype mix of *features*.

    SMOTENC raises when the data is entirely numeric or entirely
    categorical, so fall back to SMOTE / SMOTEN for those selections.
    """
    categorical_idx = [features.columns.get_loc(col) for col in categorical_columns]
    if not categorical_idx:
        return SMOTE(random_state=42)
    if len(categorical_idx) == features.shape[1]:
        return SMOTEN(random_state=42)
    return SMOTENC(categorical_features=categorical_idx, random_state=42)


def _build_model(model_name):
    """Return an unfitted classifier for the model chosen in the UI."""
    if model_name == "Random Forest":
        return RandomForestClassifier(n_estimators=100, random_state=42)
    # Labels are integer-encoded before fitting, so the deprecated
    # ``use_label_encoder`` flag is no longer needed.
    return XGBClassifier(eval_metric="mlogloss", random_state=42)


# Title
st.title("Discharge Destination Prediction")

# Generate and display synthetic dataset
data = generate_synthetic_data()
st.write("Synthetic Dataset Preview:")
st.dataframe(data.head())

# Select predictors and target. The target column is excluded from the
# predictor options: selecting it would leak the label into the features.
predictors = st.multiselect(
    "Select predictor features",
    [col for col in data.columns if col != TARGET_COLUMN],
    default=[
        "age",
        "gender",
        "mechanical_ventilation",
        "dialysis",
        "gcs",
        "sodium",
        "heart_rate",
        "creatinine",
    ],
)
target = st.selectbox("Select target column", [TARGET_COLUMN])

# Nothing below can work on an empty feature matrix.
if not predictors:
    st.warning("Select at least one predictor feature to continue.")
    st.stop()

# Split data. The string labels are mapped to 0..n_classes-1 because
# XGBoost only accepts integer class labels; the encoder keeps the names
# for reporting.
X = data[predictors]
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(data[target]),
    index=data.index,
    name=target,
)
class_ids = np.arange(len(label_encoder.classes_))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Preprocessing. Select by "number" rather than by exact width so integer
# columns that come out as int32 on some platforms are not silently dropped
# by the ColumnTransformer.
numerical_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numerical_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
])

# Select model
model_name = st.selectbox("Select a model", ["Random Forest", "XGBoost"])
model = _build_model(model_name)

# Build pipeline
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", model),
])

# Train and evaluate
if st.button("Train Model"):
    # Oversample inside the handler: Streamlit reruns the whole script on
    # every widget interaction, and resampling is only needed for training.
    # Only the training split is resampled; the test split stays untouched.
    sampler = _make_sampler(X_train, categorical_features)
    X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)

    pipeline.fit(X_train_resampled, y_train_resampled)
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)

    # Display results
    st.write("Classification Report:")
    st.text(
        classification_report(
            y_test,
            y_pred,
            labels=class_ids,
            target_names=label_encoder.classes_,
            zero_division=0,
        )
    )

    # ROC AUC needs class probabilities, not hard predictions; passing the
    # label set explicitly keeps the score well-defined even when a class
    # is never predicted.
    roc_auc = roc_auc_score(y_test, y_proba, multi_class="ovr", labels=class_ids)
    st.write(f"ROC AUC Score: {roc_auc:.4f}")