Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
from sklearn.model_selection import train_test_split | |
from sklearn.preprocessing import StandardScaler, LabelEncoder | |
from sklearn.linear_model import LogisticRegression, LinearRegression | |
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor | |
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor | |
from sklearn.svm import SVC, SVR | |
from sklearn.naive_bayes import GaussianNB, MultinomialNB | |
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor | |
from sklearn.neural_network import MLPClassifier, MLPRegressor | |
from sklearn.metrics import ( | |
accuracy_score, | |
mean_squared_error, | |
mean_absolute_error, | |
r2_score, | |
classification_report | |
) | |
class MachineLearningApp:
    """Streamlit app that trains a scikit-learn model on an uploaded table.

    The sidebar collects the dataset, the feature/target columns, the test
    split and the estimator; the main area shows the dataset summary, the
    evaluation metrics and single-row predictions. The fitted model, scaler
    and label encoder are stored in ``st.session_state`` so that later
    prediction requests can reuse them.
    """

    # Session-state entries cleared before each training run so that a failed
    # run never leaves artefacts of a previous model behind.
    _TRAINING_STATE_KEYS = ('model', 'scaler', 'label_encoder', 'problem_type')

    def __init__(self):
        """Configure the page and seed the session state."""
        st.set_page_config(
            page_title="ML Model Selection App",
            page_icon=":robot_face:",
            layout="wide"
        )
        self.initialize_session_state()

    def initialize_session_state(self):
        """Initialize all session state variables that are not set yet."""
        initial_states = {
            'data': None,
            'X': None,
            'y': None,
            'model': None,
            'scaler': None,
            'label_encoder': None,
            'problem_type': None,
            'test_size': 0.2,
            'selected_features': [],
            'target_column': None,
            'selected_model': None,
            'model_results': None
        }
        for key, value in initial_states.items():
            if key not in st.session_state:
                st.session_state[key] = value

    @staticmethod
    def _infer_problem_type(y):
        """Return ``'classification'`` or ``'regression'`` for target *y*.

        Boolean and non-numeric targets (strings, categoricals, ...) are
        treated as class labels; any other numeric target is treated as a
        regression target.
        """
        is_bool = pd.api.types.is_bool_dtype(y)
        is_numeric = pd.api.types.is_numeric_dtype(y)
        if is_bool or not is_numeric:
            return 'classification'
        return 'regression'

    @staticmethod
    def _non_numeric_columns(X):
        """Return the names of the columns of *X* whose dtype is not numeric."""
        return [
            name for name, dtype in X.dtypes.items()
            if not pd.api.types.is_numeric_dtype(dtype)
        ]

    @staticmethod
    def _drop_incomplete_rows(X, y):
        """Drop rows with a missing feature or target value.

        Returns:
            ``(X_clean, y_clean, dropped)`` where *dropped* is the number of
            rows that were removed.
        """
        complete = X.notna().all(axis=1) & y.notna()
        return X.loc[complete], y.loc[complete], int((~complete).sum())

    def sidebar_data_upload(self):
        """Render the upload widget in the sidebar and return the uploaded file."""
        with st.sidebar:
            st.header("📁 Data Upload")
            uploaded_file = st.file_uploader(
                "Choose a CSV or Excel file",
                type=['csv', 'xlsx', 'xls']
            )
        return uploaded_file

    def sidebar_feature_selection(self, df):
        """Render the feature, target and test-size widgets in the sidebar.

        Args:
            df: The loaded dataset, or ``None`` when nothing is loaded.

        Returns:
            ``(selected_features, target_column, test_size)``, or
            ``(None, None, None)`` when *df* is ``None``. The target column
            is never part of the returned feature list.
        """
        with st.sidebar:
            st.header("🔍 Feature Selection")
            if df is None:
                st.warning("Please upload a dataset first.")
                return None, None, None

            all_columns = list(df.columns)
            # 'number' also covers int32/float32 columns, not only the 64-bit ones.
            numeric_cols = df.select_dtypes(include='number').columns.tolist()
            selected_features = st.multiselect(
                "Select Features",
                options=all_columns,
                default=numeric_cols
            )
            target_column = st.selectbox(
                "Select Target Column",
                options=all_columns
            )
            if target_column in selected_features:
                # Training on the target itself would leak the answer.
                st.info(
                    f"'{target_column}' is the target and was removed from the features."
                )
                selected_features = [
                    feature for feature in selected_features
                    if feature != target_column
                ]
            test_size = st.slider(
                "Test Set Percentage",
                min_value=0.1,
                max_value=0.5,
                value=0.2,
                step=0.05,
                help="Percentage of data to use for testing"
            )
        return selected_features, target_column, test_size

    def sidebar_model_selection(self, problem_type):
        """Render the model picker for *problem_type* in the sidebar.

        Returns:
            ``(models, selected_model)``: a dict mapping display names to
            fresh, unfitted estimators, and the display name chosen by the user.
        """
        with st.sidebar:
            st.header("🤖 Model Selection")
            if problem_type == 'classification':
                models = {
                    'Logistic Regression': LogisticRegression(),
                    'Decision Tree': DecisionTreeClassifier(),
                    'Random Forest': RandomForestClassifier(),
                    'SVM': SVC(),
                    'Naive Bayes (Gaussian)': GaussianNB(),
                    'Naive Bayes (Multinomial)': MultinomialNB(),
                    'K-Nearest Neighbors': KNeighborsClassifier(),
                    'Neural Network': MLPClassifier(max_iter=1000)
                }
            else:
                models = {
                    'Linear Regression': LinearRegression(),
                    'Decision Tree': DecisionTreeRegressor(),
                    'Random Forest': RandomForestRegressor(),
                    'SVR': SVR(),
                    'K-Nearest Neighbors': KNeighborsRegressor(),
                    'Neural Network': MLPRegressor(max_iter=1000)
                }
            selected_model = st.selectbox(
                "Choose a Model",
                options=list(models.keys())
            )
        return models, selected_model

    def sidebar_prediction_input(self, selected_features):
        """Render one numeric input per feature plus a "Predict" button.

        Returns:
            A dict mapping feature name to the entered value when "Predict"
            was clicked; ``None`` otherwise or when no model is trained yet.
        """
        with st.sidebar:
            st.header("🔮 Prediction Input")
            if st.session_state.model is None:
                st.warning("Please train a model first.")
                return None
            prediction_inputs = {}
            for feature in selected_features:
                prediction_inputs[feature] = st.number_input(
                    f"Enter {feature}",
                    value=0.0,
                    step=0.1
                )
            if st.button("Predict"):
                return prediction_inputs
        return None

    def load_and_display_data(self, uploaded_file):
        """Load the uploaded file and show a preview and summary of it.

        Returns:
            The loaded DataFrame, or ``None`` when no file was uploaded or
            the file could not be read.
        """
        if uploaded_file is None:
            return None

        try:
            # Compare case-insensitively so that "DATA.CSV" is not sent to the
            # Excel reader.
            if uploaded_file.name.lower().endswith('.csv'):
                df = pd.read_csv(uploaded_file)
            else:
                df = pd.read_excel(uploaded_file)
        except Exception as e:  # UI boundary: report any reader failure to the user
            st.error(f"Error loading file: {e}")
            return None

        st.session_state.data = df
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("📋 Dataset Preview")
            st.dataframe(df.head())
        with col2:
            st.subheader("📊 Dataset Information")
            st.write(f"Total Rows: {df.shape[0]}")
            st.write(f"Total Columns: {df.shape[1]}")
            col_types = df.dtypes.value_counts()
            st.write("Column Types:")
            for dtype, count in col_types.items():
                st.text(f"{dtype}: {count} columns")
        return df

    def _show_classification_metrics(self, y_test, y_pred, label_encoder):
        """Display accuracy and the per-class report for encoded labels."""
        accuracy = accuracy_score(y_test, y_pred)
        st.metric("Accuracy", f"{accuracy:.2%}")
        st.subheader("Classification Report")
        class_names = [str(name) for name in label_encoder.classes_]
        # Passing `labels` keeps the report aligned with `target_names` even
        # when the test split happens to miss one of the classes.
        report = classification_report(
            y_test, y_pred,
            labels=np.arange(len(class_names)),
            target_names=class_names,
            output_dict=True,
            zero_division=0
        )
        for key, value in report.items():
            if isinstance(value, dict):
                st.text(f"{key}:")
                for metric, score in value.items():
                    st.text(f" {metric}: {score:.2f}")

    def _show_regression_metrics(self, y_test, y_pred):
        """Display MSE, MAE and R² for a regression model."""
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        st.metric("Mean Squared Error", f"{mse:.4f}")
        st.metric("Mean Absolute Error", f"{mae:.4f}")
        st.metric("R² Score", f"{r2:.4f}")

    def train_and_evaluate_model(self, X, y, test_size, models, selected_model_name):
        """Train the selected model, display its metrics and store it.

        Args:
            X: DataFrame with the feature columns.
            y: Series with the target column.
            test_size: Fraction of the rows held out for evaluation.
            models: Dict mapping display names to unfitted estimators.
            selected_model_name: Key of the estimator to train.

        On success the fitted model, the scaler, the label encoder (``None``
        for regression), the problem type, ``X`` and the trained feature names
        are written to ``st.session_state``. On invalid input or a training
        failure an error is displayed and the session state is left untouched.
        """
        with st.container():
            non_numeric = self._non_numeric_columns(X)
            if non_numeric:
                st.error(
                    "Selected features must be numeric. Non-numeric columns: "
                    + ", ".join(map(str, non_numeric))
                )
                return

            X, y, dropped = self._drop_incomplete_rows(X, y)
            if dropped:
                st.warning(f"Dropped {dropped} rows with missing values.")

            problem_type = self._infer_problem_type(y)
            is_classification = problem_type == 'classification'
            label_encoder = None
            try:
                if is_classification:
                    label_encoder = LabelEncoder()
                    y_encoded = label_encoder.fit_transform(y)
                else:
                    y_encoded = y
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y_encoded, test_size=test_size, random_state=42
                )
                # Fit the scaler on the training split only, so that test-set
                # statistics do not leak into training; the same fitted scaler
                # is reused later for predictions.
                scaler = StandardScaler()
                X_train_scaled = scaler.fit_transform(X_train)
                X_test_scaled = scaler.transform(X_test)
                model = models[selected_model_name]
                model.fit(X_train_scaled, y_train)
                y_pred = model.predict(X_test_scaled)
            except (ValueError, TypeError) as exc:
                # e.g. too few rows to split, a single class, or an estimator
                # that rejects standardized (negative) inputs.
                st.error(f"Model training failed: {exc}")
                return

            st.header("🔬 Model Training Results")
            col1, col2 = st.columns(2)
            with col1:
                st.subheader("📈 Model Performance")
                if is_classification:
                    self._show_classification_metrics(y_test, y_pred, label_encoder)
                else:
                    self._show_regression_metrics(y_test, y_pred)
            with col2:
                st.subheader("📝 Model Details")
                st.write(f"Selected Model: {selected_model_name}")
                st.write(f"Problem Type: {problem_type}")
                st.write(f"Test Set Size: {test_size:.0%}")
                st.write(f"Features Used: {', '.join(map(str, X.columns))}")
                st.write(f"Target Column: {y.name}")

            st.session_state.model = model
            st.session_state.scaler = scaler
            st.session_state.label_encoder = label_encoder
            st.session_state.problem_type = problem_type
            st.session_state.X = X
            # Remember what the model was trained on so that the prediction
            # form keeps matching it even if the sidebar selection changes.
            st.session_state.selected_features = list(X.columns)

    def make_prediction(self, prediction_inputs):
        """Predict for one row of user-entered feature values and display it.

        Args:
            prediction_inputs: Dict mapping feature name to the entered value.
        """
        state = st.session_state
        if state.model is None or state.scaler is None:
            st.error("Please train a model first.")
            return

        input_df = pd.DataFrame([prediction_inputs])
        # Put the columns in the order the scaler was fitted with.
        expected = list(getattr(state.scaler, 'feature_names_in_', input_df.columns))
        missing = [name for name in expected if name not in input_df.columns]
        if missing:
            st.error(
                "Missing values for trained features: "
                + ", ".join(map(str, missing))
            )
            return
        input_df = input_df[expected]

        input_scaled = state.scaler.transform(input_df)
        prediction = state.model.predict(input_scaled)
        if state.label_encoder is not None:
            prediction = state.label_encoder.inverse_transform(prediction)

        st.header("🎯 Prediction Result")
        st.subheader("Input Data")
        st.dataframe(input_df)
        st.subheader("Predicted Value")
        st.write(prediction[0])

    def run(self):
        """Main application flow: upload, select, train, then predict."""
        uploaded_file = self.sidebar_data_upload()
        st.title("🚀 Predict on Custom Data using any ML Model")
        df = self.load_and_display_data(uploaded_file)
        if df is None:
            return

        selected_features, target_column, test_size = self.sidebar_feature_selection(df)
        if not selected_features or not target_column:
            return

        X = df[selected_features]
        y = df[target_column]
        problem_type = self._infer_problem_type(y)
        models, selected_model = self.sidebar_model_selection(problem_type)

        with st.sidebar:
            train_clicked = st.button("Train Model", type="primary")
        if train_clicked:
            for key in self._TRAINING_STATE_KEYS:
                st.session_state[key] = None
            # Called outside the sidebar context so the results are rendered
            # in the main area.
            self.train_and_evaluate_model(
                X, y, test_size, models, selected_model
            )

        # Ask for the features the stored model was trained on; fall back to
        # the current selection when nothing has been trained yet.
        trained_features = st.session_state.selected_features or selected_features
        prediction_inputs = self.sidebar_prediction_input(trained_features)
        if prediction_inputs:
            self.make_prediction(prediction_inputs)
# Build the app and run its main flow when this file is executed as a script.
if __name__ == "__main__":
    app = MachineLearningApp()
    app.run()