"""MLWiz: a Streamlit front-end for no-code EDA and AutoML built on PyCaret.

Provides dataset loading (uploaded CSV/XLSX or PyCaret sample data), profiling
reports (ydata-profiling + Sweetviz), model building for five PyCaret task
types, and download of the best saved model.
"""
import os
import traceback

import pandas as pd
import streamlit as st
import streamlit.components.v1 as components
import sweetviz as sv
from pycaret.anomaly import (
    setup as ano_setup, create_model as ano_create,
    plot_model as ano_plot, save_model as ano_save, pull as ano_pull,
)
from pycaret.classification import (
    setup as cls_setup, compare_models as cls_compare,
    save_model as cls_save, pull as cls_pull, plot_model as cls_plot,
)
from pycaret.clustering import (
    setup as clu_setup, create_model as clu_create,
    plot_model as clu_plot, save_model as clu_save, pull as clu_pull,
)
from pycaret.datasets import get_data
from pycaret.regression import (
    setup as reg_setup, compare_models as reg_compare,
    save_model as reg_save, pull as reg_pull, plot_model as reg_plot,
)
from pycaret.time_series import (
    setup as ts_setup, compare_models as ts_compare,
    save_model as ts_save, pull as ts_pull, plot_model as ts_plot,
)
from ydata_profiling import ProfileReport


def get_all_datasets():
    """Return the names of all sample datasets bundled with PyCaret."""
    index_df = get_data('index')
    return index_df['Dataset'].to_list()


def _render_html_file(path, container):
    """Read a saved HTML report and embed it inside a Streamlit container."""
    # encoding pinned to utf-8: the profiling libraries write utf-8 HTML,
    # and the platform default encoding may differ (e.g. on Windows).
    with open(path, 'r', encoding='utf-8') as f:
        html_content = f.read()
    with container:
        components.html(html_content, height=800, scrolling=True)


def show_profile_reports(container):
    """Re-display previously generated profiling reports, if any exist on disk."""
    if os.path.exists("profile_report.html"):
        _render_html_file('profile_report.html', container)
    if os.path.exists("sweetviz_report.html"):
        _render_html_file('sweetviz_report.html', container)


def data_profile(df, container):
    """Generate a ydata-profiling report for *df* and render it in *container*."""
    profile = ProfileReport(df)
    profile.to_file("profile_report.html")
    _render_html_file('profile_report.html', container)


def update_progress(progress_bar, step, max_steps):
    """Advance *progress_bar* to step/max_steps (0-100%) with a status label."""
    progress = int((step / max_steps) * 100)
    text = f"Processing....Step {step}/{max_steps}"
    if step == max_steps:
        text = "Process Completed"
    progress_bar.progress(progress, text=text)


def display_sweetviz_report(dataframe, container):
    """Generate a Sweetviz analysis report for *dataframe* and render it."""
    report = sv.analyze(dataframe)
    report.show_html('sweetviz_report.html', open_browser=False)
    _render_html_file('sweetviz_report.html', container)


def handle_exception(e):
    """Show a friendly error banner plus an expandable full traceback."""
    st.error(
        f"""The app has encountered an error:
        **{e}**
        Please check settings - columns selections and model parameters
        Or Create an issue [here](https://github.com/bitbotcoder/mlwiz/issues/new) with the below error details
        """,
        icon="🥺",
    )
    with st.expander("See Error details"):
        st.error(traceback.format_exc())


def load_data(uploaded_file):
    """Load an uploaded CSV/XLSX file, preview it, and stash it in session state."""
    try:
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        elif uploaded_file.name.endswith('.xlsx'):
            df = pd.read_excel(uploaded_file)
        else:
            # Fix: previously an unsupported extension left `df` unbound and
            # surfaced as a confusing NameError.
            st.error("Unsupported file type. Please upload a .csv or .xlsx file.")
            return
        st.write("## Dataset")
        st.write(df.head())
        st.session_state['dataframe'] = df
    except Exception as e:
        handle_exception(e)


def load_pycaret_dataset(dataset_name):
    """Fetch a PyCaret sample dataset, preview it, and stash it in session state."""
    try:
        df = get_data(dataset_name)
        st.write("## Dataset")
        st.write(df.head())
        st.session_state['dataframe'] = df
    except Exception as e:
        handle_exception(e)


def eda_report():
    """Render EDA controls: generate fresh profiling reports or reuse saved ones."""
    if 'dataframe' not in st.session_state:
        return
    df = st.session_state['dataframe']
    col1, col2 = st.columns([0.6, 0.4])
    new_report = col1.toggle(":blue[Generate New]", value=True)
    show_button = col2.button("Show Report")
    pb = st.progress(0, text="Generating Report")
    cont = st.container(border=False)
    try:
        if show_button:
            if new_report:
                update_progress(pb, 1, 4)
                data_profile(df, cont)
                update_progress(pb, 2, 4)
                display_sweetviz_report(df, cont)
                update_progress(pb, 4, 4)
            else:
                show_profile_reports(cont)
    except Exception as e:
        handle_exception(e)


def build_model(task, container):
    """Collect column/parameter choices and run the PyCaret pipeline for *task*.

    *task* is one of "Classification", "Regression", "Clustering",
    "Anomaly Detection" or "Time Series Forecasting". The best model found is
    saved to a task-specific ``best_*_model`` file for later download.
    """
    if 'dataframe' not in st.session_state:
        return
    df = st.session_state['dataframe']

    # --- column selection ---
    feature_expander = container.expander("Select Columns")
    supervised = task in ["Classification", "Regression", "Time Series Forecasting"]
    target_column = (
        feature_expander.selectbox("Select target column", df.columns)
        if supervised else None
    )
    numerical_columns = feature_expander.multiselect("Select numerical columns", df.columns)
    categorical_columns = feature_expander.multiselect("Select categorical columns", df.columns)

    # --- preprocessing parameters ---
    params_expander = container.expander("Tune Parameters")
    handle_missing_data = params_expander.toggle("Handle Missing Data", value=True)
    handle_outliers = params_expander.toggle("Handle Outliers", value=True)

    normalize = params_expander.checkbox("Normalize", value=False)
    # Each method selector is only shown (and only evaluated) when its flag is
    # on, so index is always 0 — the old `0 if flag else -1` was dead code.
    normalize_method = (
        params_expander.selectbox(
            "Normalize Method", ["zscore", "minmax", "maxabs", "robust"], index=0)
        if normalize else None
    )
    transformation = params_expander.checkbox("Apply Transformation", value=False)
    transformation_method = (
        params_expander.selectbox(
            "Transformation Method", ["yeo-johnson", "quantile"], index=0)
        if transformation else None
    )
    polynomial_features = params_expander.checkbox("Polynomial Features", value=False)
    polynomial_degree = (
        params_expander.slider("Polynomial Degree", 2, 5, 2)
        if polynomial_features else None
    )
    remove_multicollinearity = params_expander.checkbox("Remove Multicollinearity", value=False)
    multicollinearity_threshold = (
        params_expander.slider("Multicollinearity Threshold", 0.5, 1.0, 0.9)
        if remove_multicollinearity else None
    )
    # Feature selection is only meaningful for supervised-style tasks.
    if task not in ("Anomaly Detection", "Clustering"):
        feature_selection = params_expander.checkbox("Feature Selection", value=False)
        feature_selection_method = (
            params_expander.selectbox(
                "Feature Selection Method", ["classic", "exhaustive"], index=0)
            if feature_selection else None
        )
    else:
        feature_selection = None
        feature_selection_method = None

    try:
        # Common keyword arguments for the PyCaret setup() of each module.
        setup_kwargs = {
            'data': df[numerical_columns + categorical_columns
                       + ([target_column] if target_column else [])],
            'categorical_features': categorical_columns,
            'numeric_features': numerical_columns,
            'target': target_column,
            'preprocess': handle_missing_data,
            'remove_outliers': handle_outliers,
            'normalize': normalize,
            'normalize_method': normalize_method,
            'transformation': transformation,
            'transformation_method': transformation_method,
            'polynomial_features': polynomial_features,
            'polynomial_degree': polynomial_degree,
            'remove_multicollinearity': remove_multicollinearity,
            'multicollinearity_threshold': multicollinearity_threshold,
            'feature_selection': feature_selection,
            'feature_selection_method': feature_selection_method,
        }
        pb = st.progress(0, text="Building Model...")

        if task == "Classification" and st.button("Run Classification"):
            df[target_column] = df[target_column].astype('category')
            df.dropna(subset=[target_column] + numerical_columns + categorical_columns,
                      inplace=True)
            if len(df) < 2:
                st.error("Not enough data to split into train and test sets.")
                return
            update_progress(pb, 1, 7)
            exp = cls_setup(**setup_kwargs)
            update_progress(pb, 2, 7)
            best_model = cls_compare()
            update_progress(pb, 3, 7)
            st.dataframe(cls_pull())
            update_progress(pb, 4, 7)
            cls_plot(best_model, plot='auc', display_format="streamlit")
            cls_plot(best_model, plot='confusion_matrix', display_format="streamlit")
            update_progress(pb, 5, 7)
            st.image(cls_plot(best_model, plot='pr', save=True))
            update_progress(pb, 6, 7)
            cls_save(best_model, 'best_classification_model')
            st.write('Best Model based on metrics - ')
            st.write(best_model)
            update_progress(pb, 7, 7)

        elif task == "Regression" and st.button("Run Regression"):
            update_progress(pb, 1, 7)
            df[target_column] = pd.to_numeric(df[target_column], errors='coerce')
            update_progress(pb, 2, 7)
            df.dropna(subset=[target_column] + numerical_columns + categorical_columns,
                      inplace=True)
            update_progress(pb, 3, 7)
            if len(df) < 2:
                st.error("Not enough data to split into train and test sets.")
                return
            exp = reg_setup(**setup_kwargs)
            best_model = reg_compare()
            update_progress(pb, 4, 7)
            st.dataframe(reg_pull())
            update_progress(pb, 5, 7)
            st.image(reg_plot(best_model, plot='residuals', save=True))
            # Fix: the 'error' plot was rendered twice in the original.
            st.image(reg_plot(best_model, plot='error', save=True))
            update_progress(pb, 6, 7)
            reg_save(best_model, 'best_regression_model')
            st.write('Best Model based on metrics - ')
            st.write(best_model)
            update_progress(pb, 7, 7)

        elif task == "Clustering" and st.button("Run Clustering"):
            update_progress(pb, 1, 7)
            df.dropna(subset=numerical_columns + categorical_columns, inplace=True)
            update_progress(pb, 2, 7)
            # Unsupervised setup() does not accept these keyword arguments.
            setup_kwargs.pop('target')
            setup_kwargs.pop('feature_selection')
            setup_kwargs.pop('feature_selection_method')
            update_progress(pb, 3, 7)
            exp = clu_setup(**setup_kwargs)
            best_model = clu_create('kmeans')
            update_progress(pb, 4, 7)
            clu_plot(best_model, plot='cluster', display_format='streamlit')
            clu_plot(best_model, plot='elbow', display_format='streamlit')
            update_progress(pb, 5, 7)
            st.write(best_model)
            st.dataframe(clu_pull())
            update_progress(pb, 6, 7)
            clu_save(best_model, 'best_clustering_model')
            st.write('Best Model based on metrics - ')
            st.write(best_model)
            update_progress(pb, 7, 7)

        elif task == "Anomaly Detection" and st.button("Run Anomaly Detection"):
            update_progress(pb, 1, 7)
            df.dropna(subset=numerical_columns + categorical_columns, inplace=True)
            update_progress(pb, 2, 7)
            # Unsupervised setup() does not accept these keyword arguments.
            setup_kwargs.pop('target')
            setup_kwargs.pop('feature_selection')
            setup_kwargs.pop('feature_selection_method')
            update_progress(pb, 3, 7)
            exp = ano_setup(**setup_kwargs)
            best_model = ano_create('iforest')
            update_progress(pb, 4, 7)
            ano_plot(best_model, plot='tsne', display_format='streamlit')
            update_progress(pb, 5, 7)
            st.write(best_model)
            st.dataframe(ano_pull())
            update_progress(pb, 6, 7)
            ano_save(best_model, 'best_anomaly_model')
            st.write('Best Model based on metrics - ')
            st.write(best_model)
            update_progress(pb, 7, 7)

        elif task == "Time Series Forecasting":
            date_column = feature_expander.selectbox("Select date column", df.columns)
            if st.button("Run Time Series Forecasting"):
                update_progress(pb, 1, 5)
                df[date_column] = pd.to_datetime(df[date_column])
                df[target_column] = pd.to_numeric(df[target_column], errors='coerce')
                df.dropna(subset=[target_column], inplace=True)
                update_progress(pb, 2, 5)
                # Time-series setup needs a regular DatetimeIndex; coerce to daily.
                df = df.set_index(date_column).asfreq('D')
                exp = ts_setup(df, target=target_column,
                               numeric_imputation_target='mean',
                               numeric_imputation_exogenous='mean')
                best_model = ts_compare()
                update_progress(pb, 3, 5)
                st.dataframe(ts_pull())
                ts_plot(best_model, plot='forecast', display_format="streamlit")
                ts_save(best_model, 'best_timeseries_model')
                update_progress(pb, 4, 5)
                st.write('Best Model based on metrics - ')
                st.write(best_model)
                update_progress(pb, 5, 5)
    except Exception as e:
        handle_exception(e)


def download_model(task):
    """Offer the saved best-model file for *task* as a Streamlit download."""
    model_files = {
        "Classification": 'best_classification_model.pkl',
        "Regression": 'best_regression_model.pkl',
        "Clustering": 'best_clustering_model.pkl',
        "Anomaly Detection": 'best_anomaly_model.pkl',
        "Time Series Forecasting": 'best_timeseries_model.pkl',
    }
    model_file = model_files.get(task)
    if model_file:
        if os.path.exists(model_file):
            try:
                with open(model_file, 'rb') as f:
                    st.download_button('Download Model', f, file_name=model_file)
            except Exception as e:
                handle_exception(e)
        else:
            st.error("❗No File Found | First Build A ML Model ")