# Source: Hugging Face Space by Somnath3570
# Commit: "Create app.py" (17e472a, verified)
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTENC
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Function to generate synthetic dataset
def generate_synthetic_data(n_samples: int = 1000, seed: int = 42) -> pd.DataFrame:
    """Build a reproducible synthetic patient table for the demo.

    Every column is drawn independently at random, so the predictors carry
    no real signal about ``discharge_destination``.

    Args:
        n_samples: Number of rows to generate.
        seed: Seed for the private random generator; the same seed always
            yields the same frame.

    Returns:
        A ``pandas.DataFrame`` with ``n_samples`` rows and the columns
        ``age``, ``gender``, ``mechanical_ventilation``, ``dialysis``,
        ``gcs``, ``sodium``, ``heart_rate``, ``creatinine`` and
        ``discharge_destination``.
    """
    # A private RandomState keeps the draws reproducible without reseeding
    # NumPy's global generator as a hidden side effect.
    rng = np.random.RandomState(seed)
    data = pd.DataFrame({
        "age": rng.randint(18, 90, n_samples),
        "gender": rng.choice(["Male", "Female"], n_samples),
        "mechanical_ventilation": rng.choice([0, 1], n_samples),
        "dialysis": rng.choice([0, 1], n_samples),
        # randint excludes its upper bound, so 16 is required for the top
        # score of 15 to appear (gcs is presumably the Glasgow Coma Scale,
        # which ranges 3-15).
        "gcs": rng.randint(3, 16, n_samples),
        "sodium": rng.uniform(135, 145, n_samples),
        "heart_rate": rng.randint(60, 120, n_samples),
        "creatinine": rng.uniform(0.5, 3.5, n_samples),
        "discharge_destination": rng.choice(
            ["Home", "Nursing Facility", "Rehabilitation", "Death"],
            n_samples,
            p=[0.6, 0.2, 0.1, 0.1],
        ),
    })
    return data
# Page heading
st.title("Discharge Destination Prediction")

# Build the synthetic table once per script run and show its first rows
data = generate_synthetic_data()
st.write("Synthetic Dataset Preview:")
preview_rows = data.head()
st.dataframe(preview_rows)
# Select predictors and target
# The target column must never be offered as a predictor: training on it
# would leak the label into the features.
target_options = ["discharge_destination"]
feature_options = [col for col in data.columns if col not in target_options]
predictors = st.multiselect(
    "Select predictor features",
    feature_options,
    default=["age", "gender", "mechanical_ventilation", "dialysis", "gcs", "sodium", "heart_rate", "creatinine"],
)
target = st.selectbox("Select target column", target_options)

# Nothing downstream works on an empty feature frame, so halt this script
# run until the user has picked at least one predictor.
if not predictors:
    st.warning("Select at least one predictor feature to continue.")
    st.stop()

# Split data (stratified so every class appears in both partitions)
X = data[predictors]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# Preprocessing
# 'number' matches every numeric dtype. Listing only int64/float64 misses
# int32 columns (np.random.randint yields those on some platforms), and the
# ColumnTransformer would then drop them silently (remainder='drop').
numerical_features = X.select_dtypes(include='number').columns
# Treat every remaining column as categorical so that each selected feature
# lands in exactly one transformer.
categorical_features = X.select_dtypes(exclude='number').columns
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])
# SMOTENC for imbalance
# SMOTENC only accepts a mix of categorical and numerical features, so pick
# the SMOTE variant that fits the columns the user actually selected.
# (`smote_nc` keeps its original name even when it holds another variant.)
categorical_indices = [X.columns.get_loc(col) for col in categorical_features]
if not categorical_indices:
    # Purely numerical features: plain SMOTE.
    from imblearn.over_sampling import SMOTE
    smote_nc = SMOTE(random_state=42)
elif len(categorical_indices) == X.shape[1]:
    # Purely categorical features: the nominal-only variant.
    from imblearn.over_sampling import SMOTEN
    smote_nc = SMOTEN(random_state=42)
else:
    smote_nc = SMOTENC(
        categorical_features=categorical_indices,
        random_state=42,
    )
# Only the training partition is oversampled; the test set keeps its
# original class distribution.
X_train_resampled, y_train_resampled = smote_nc.fit_resample(X_train, y_train)
# Let the user choose the classifier; anything other than "Random Forest"
# resolves to XGBoost.
model_name = st.selectbox("Select a model", ["Random Forest", "XGBoost"])
model = (
    RandomForestClassifier(n_estimators=100, random_state=42)
    if model_name == "Random Forest"
    else XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
)
# Chain preprocessing and the chosen classifier into a single estimator
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', model),
]
pipeline = Pipeline(steps=pipeline_steps)
# Train and evaluate
if st.button("Train Model"):
    # XGBoost only accepts integer class ids 0..k-1, so train on sorted
    # label codes and map predictions back to the original labels.
    y_train_codes, class_labels = pd.factorize(y_train_resampled, sort=True)
    class_labels = np.asarray(class_labels)
    pipeline.fit(X_train_resampled, y_train_codes)

    # Probability columns follow the code order, i.e. the sorted labels.
    y_proba = pipeline.predict_proba(X_test)
    y_pred = class_labels[np.argmax(y_proba, axis=1)]

    # Display results
    st.write("Classification Report:")
    st.text(classification_report(y_test, y_pred))

    # ROC AUC needs class probabilities. One-hot encoded hard predictions
    # discard the ranking information and break whenever a class is never
    # predicted (the two dummy frames then differ in width).
    roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr', labels=class_labels)
    st.write(f"ROC AUC Score: {roc_auc:.4f}")