"""Streamlit app: upload a dataset, train a scikit-learn model, and predict.

Run with ``streamlit run <this file>``.
"""

import numpy as np
import pandas as pd
import streamlit as st
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor


class MachineLearningApp:
    """Interactive model-selection app built on Streamlit and scikit-learn.

    The trained model, its fitted scaler, the label encoder (classification
    only) and the list of features used for training are kept in
    ``st.session_state`` so they survive Streamlit's script reruns.
    """

    def __init__(self):
        # set_page_config must be the first Streamlit call of the script run.
        st.set_page_config(
            page_title="ML Model Selection App",
            page_icon=":robot_face:",
            layout="wide",
        )
        self.initialize_session_state()

    def initialize_session_state(self):
        """Initialize all session state variables that are not set yet."""
        initial_states = {
            'data': None,
            'X': None,
            'y': None,
            'model': None,
            'scaler': None,
            'label_encoder': None,
            'problem_type': None,
            'test_size': 0.2,
            'selected_features': [],
            'target_column': None,
            'selected_model': None,
            'model_results': None,
        }
        for key, value in initial_states.items():
            if key not in st.session_state:
                st.session_state[key] = value

    @staticmethod
    def _infer_problem_type(y):
        """Return ``'regression'`` for numeric targets, else ``'classification'``.

        Boolean targets count as classification. Checking "is numeric" rather
        than ``dtype == 'object'`` also covers ``category`` and pandas string
        dtypes, which are not reported as ``object``.
        """
        is_numeric = pd.api.types.is_numeric_dtype(y)
        is_boolean = pd.api.types.is_bool_dtype(y)
        if is_numeric and not is_boolean:
            return 'regression'
        return 'classification'

    def sidebar_data_upload(self):
        """Render the upload widget in the sidebar.

        Returns:
            The uploaded file object, or ``None`` when nothing is uploaded.
        """
        with st.sidebar:
            st.header("📊 Data Upload")
            uploaded_file = st.file_uploader(
                "Choose a CSV or Excel file",
                type=['csv', 'xlsx', 'xls'],
            )
        return uploaded_file

    def sidebar_feature_selection(self, df):
        """Render feature, target and test-size widgets in the sidebar.

        Args:
            df: The loaded dataset, or ``None``.

        Returns:
            ``(selected_features, target_column, test_size)``, or
            ``(None, None, None)`` when ``df`` is ``None``. The target column
            is never part of the returned feature list.
        """
        with st.sidebar:
            st.header("🔍 Feature Selection")

            if df is None:
                st.warning("Please upload a dataset first.")
                return None, None, None

            # 'number' covers every numeric width, not just int64/float64.
            numeric_cols = df.select_dtypes(include='number').columns.tolist()

            selected_features = st.multiselect(
                "Select Features",
                options=list(df.columns),
                default=numeric_cols,
            )
            target_column = st.selectbox(
                "Select Target Column",
                options=list(df.columns),
            )

            # Training on the target itself would leak the answer into the
            # model and force the user to type it in at prediction time.
            if target_column in selected_features:
                st.warning(
                    f"'{target_column}' is the target and was removed "
                    "from the features."
                )
                selected_features = [
                    column for column in selected_features
                    if column != target_column
                ]

            test_size = st.slider(
                "Test Set Percentage",
                min_value=0.1,
                max_value=0.5,
                value=0.2,
                step=0.05,
                help="Percentage of data to use for testing",
            )
        return selected_features, target_column, test_size

    def sidebar_model_selection(self, problem_type):
        """Render the model picker in the sidebar.

        Args:
            problem_type: ``'classification'`` or anything else (regression).

        Returns:
            ``(models, selected_model)``: a dict mapping display names to
            unfitted estimators, and the chosen display name.
        """
        with st.sidebar:
            st.header("🤖 Model Selection")

            if problem_type == 'classification':
                models = {
                    'Logistic Regression': LogisticRegression(),
                    'Decision Tree': DecisionTreeClassifier(),
                    'Random Forest': RandomForestClassifier(),
                    'SVM': SVC(),
                    'Naive Bayes (Gaussian)': GaussianNB(),
                    # MultinomialNB rejects negative inputs, and standardised
                    # features always contain some, so rescale to [0, 1]
                    # before the estimator sees them.
                    'Naive Bayes (Multinomial)': make_pipeline(
                        MinMaxScaler(), MultinomialNB()
                    ),
                    'K-Nearest Neighbors': KNeighborsClassifier(),
                    'Neural Network': MLPClassifier(max_iter=1000),
                }
            else:
                models = {
                    'Linear Regression': LinearRegression(),
                    'Decision Tree': DecisionTreeRegressor(),
                    'Random Forest': RandomForestRegressor(),
                    'SVR': SVR(),
                    'K-Nearest Neighbors': KNeighborsRegressor(),
                    'Neural Network': MLPRegressor(max_iter=1000),
                }

            selected_model = st.selectbox(
                "Choose a Model",
                options=list(models.keys()),
            )
        return models, selected_model

    def sidebar_prediction_input(self, selected_features):
        """Render one numeric input per feature plus a "Predict" button.

        Args:
            selected_features: Names of the features to collect values for.

        Returns:
            A ``{feature: value}`` dict when "Predict" was clicked in this
            run, otherwise ``None`` (also when no model is trained yet).
        """
        with st.sidebar:
            st.header("🔮 Prediction Input")

            if st.session_state.model is None:
                st.warning("Please train a model first.")
                return None

            prediction_inputs = {}
            for feature in selected_features:
                prediction_inputs[feature] = st.number_input(
                    f"Enter {feature}",
                    value=0.0,
                    step=0.1,
                )

            if st.button("Predict"):
                return prediction_inputs
        return None

    def load_and_display_data(self, uploaded_file):
        """Load the uploaded file and show a preview and basic statistics.

        Args:
            uploaded_file: Object returned by ``st.file_uploader`` or ``None``.

        Returns:
            The loaded ``DataFrame``, or ``None`` when nothing was uploaded
            or the file could not be read.
        """
        if uploaded_file is None:
            return None

        try:
            # Compare case-insensitively so "DATA.CSV" is not sent to the
            # Excel reader.
            if uploaded_file.name.lower().endswith('.csv'):
                df = pd.read_csv(uploaded_file)
            else:
                df = pd.read_excel(uploaded_file)
        except Exception as e:
            # UI boundary: parser, encoding and missing-engine errors are all
            # reported to the user instead of crashing the app.
            st.error(f"Error loading file: {e}")
            return None

        st.session_state.data = df

        col1, col2 = st.columns(2)
        with col1:
            st.subheader("📋 Dataset Preview")
            st.dataframe(df.head())

        with col2:
            st.subheader("📊 Dataset Information")
            st.write(f"Total Rows: {df.shape[0]}")
            st.write(f"Total Columns: {df.shape[1]}")

            col_types = df.dtypes.value_counts()
            st.write("Column Types:")
            for dtype, count in col_types.items():
                st.text(f"{dtype}: {count} columns")

        return df

    def train_and_evaluate_model(self, X, y, test_size, models,
                                 selected_model_name):
        """Train the selected model, show its metrics and store it.

        Args:
            X: Feature ``DataFrame``.
            y: Target ``Series``.
            test_size: Fraction of rows held out for evaluation.
            models: Dict mapping display names to unfitted estimators.
            selected_model_name: Key of the estimator to train.

        On success the model, scaler, label encoder, problem type, ``X`` and
        the trained feature names are written to ``st.session_state``. On
        failure an error is shown and session state is left untouched.
        """
        with st.container():
            # The prediction form only offers numeric inputs, and the scaler
            # cannot handle text, so refuse non-numeric features up front.
            non_numeric = [
                str(column) for column, dtype in X.dtypes.items()
                if not pd.api.types.is_numeric_dtype(dtype)
            ]
            if non_numeric:
                st.error(
                    "These features are not numeric and cannot be used: "
                    + ", ".join(non_numeric)
                )
                return

            # None of the offered estimators is guaranteed to accept NaN.
            complete_rows = X.notna().all(axis=1) & y.notna()
            dropped = int((~complete_rows).sum())
            if dropped:
                st.warning(f"Dropped {dropped} rows with missing values.")
                X = X[complete_rows]
                y = y[complete_rows]

            problem_type = self._infer_problem_type(y)
            label_encoder = None

            try:
                if problem_type == 'classification':
                    label_encoder = LabelEncoder()
                    y_encoded = label_encoder.fit_transform(y)
                else:
                    y_encoded = y

                X_train, X_test, y_train, y_test = train_test_split(
                    X, y_encoded, test_size=test_size, random_state=42
                )

                # Fit the scaler on the training split only so statistics of
                # the held-out rows do not leak into training.
                scaler = StandardScaler()
                X_train_scaled = scaler.fit_transform(X_train)
                X_test_scaled = scaler.transform(X_test)

                model = models[selected_model_name]
                model.fit(X_train_scaled, y_train)
                y_pred = model.predict(X_test_scaled)
            except (ValueError, TypeError) as exc:
                st.error(f"Model training failed: {exc}")
                return

            st.header("🔬 Model Training Results")
            col1, col2 = st.columns(2)

            with col1:
                st.subheader("📊 Model Performance")
                if problem_type == 'classification':
                    accuracy = accuracy_score(y_test, y_pred)
                    st.metric("Accuracy", f"{accuracy:.2%}")

                    st.subheader("Classification Report")
                    class_names = [str(c) for c in label_encoder.classes_]
                    # Explicit labels keep the report aligned with
                    # target_names even when a class is absent from the test
                    # split.
                    report = classification_report(
                        y_test,
                        y_pred,
                        labels=np.arange(len(class_names)),
                        target_names=class_names,
                        zero_division=0,
                        output_dict=True,
                    )
                    for key, value in report.items():
                        # Scalar entries such as the overall accuracy are
                        # already shown above.
                        if isinstance(value, dict):
                            st.text(f"{key}:")
                            for metric, score in value.items():
                                st.text(f" {metric}: {score:.2f}")
                else:
                    mse = mean_squared_error(y_test, y_pred)
                    mae = mean_absolute_error(y_test, y_pred)
                    r2 = r2_score(y_test, y_pred)

                    st.metric("Mean Squared Error", f"{mse:.4f}")
                    st.metric("Mean Absolute Error", f"{mae:.4f}")
                    st.metric("R² Score", f"{r2:.4f}")

            with col2:
                st.subheader("📈 Model Details")
                st.write(f"Selected Model: {selected_model_name}")
                st.write(f"Problem Type: {problem_type}")
                st.write(f"Test Set Size: {test_size:.0%}")
                # Column labels are not guaranteed to be strings.
                st.write(f"Features Used: {', '.join(map(str, X.columns))}")
                st.write(f"Target Column: {y.name}")

            # Store the very scaler the model was trained with.
            st.session_state.model = model
            st.session_state.scaler = scaler
            st.session_state.label_encoder = label_encoder
            st.session_state.problem_type = problem_type
            st.session_state.X = X
            st.session_state.selected_features = list(X.columns)

    def make_prediction(self, prediction_inputs):
        """Predict a single row with the stored model and display the result.

        Args:
            prediction_inputs: ``{feature: value}`` dict for one observation.
        """
        model = st.session_state.model
        scaler = st.session_state.scaler
        if model is None or scaler is None:
            st.error("Please train a model first.")
            return

        input_df = pd.DataFrame([prediction_inputs])

        # Present the columns exactly as they were at training time; the
        # sidebar selection may have changed since then.
        trained_features = list(st.session_state.selected_features or [])
        if trained_features:
            missing = [
                str(column) for column in trained_features
                if column not in input_df.columns
            ]
            if missing:
                st.error(
                    "Missing values for trained features: "
                    + ", ".join(missing)
                )
                return
            input_df = input_df[trained_features]

        input_scaled = scaler.transform(input_df)
        prediction = model.predict(input_scaled)

        label_encoder = st.session_state.label_encoder
        if label_encoder is not None:
            prediction = label_encoder.inverse_transform(prediction)

        st.header("🎯 Prediction Result")
        st.subheader("Input Data")
        st.dataframe(input_df)
        st.subheader("Predicted Value")
        st.write(prediction[0])

    def run(self):
        """Main application flow: upload, configure, train, predict."""
        uploaded_file = self.sidebar_data_upload()

        st.title("🚀 Predict on Custom Data using any ML Model")

        df = self.load_and_display_data(uploaded_file)
        if df is None:
            return

        selected_features, target_column, test_size = (
            self.sidebar_feature_selection(df)
        )
        if not selected_features or target_column is None:
            return

        X = df[selected_features]
        y = df[target_column]
        problem_type = self._infer_problem_type(y)

        models, selected_model = self.sidebar_model_selection(problem_type)

        # Only the button lives in the sidebar; training runs outside of it
        # so the results are rendered in the main page area.
        with st.sidebar:
            train_clicked = st.button("Train Model", type="primary")

        if train_clicked:
            for key in ['model', 'scaler', 'label_encoder', 'problem_type']:
                st.session_state[key] = None
            self.train_and_evaluate_model(
                X, y, test_size, models, selected_model
            )

        # Ask for the features the stored model was trained on, which may
        # differ from the current sidebar selection.
        prediction_features = (
            st.session_state.selected_features or selected_features
        )
        prediction_inputs = self.sidebar_prediction_input(prediction_features)
        if prediction_inputs:
            self.make_prediction(prediction_inputs)


if __name__ == "__main__":
    app = MachineLearningApp()
    app.run()