automl / ml_pipeline.py
bitbotcoder
alpha1
7f45a59
import streamlit as st
import pandas as pd
import sweetviz as sv
from pycaret.classification import setup as cls_setup, compare_models as cls_compare, save_model as cls_save, pull as cls_pull, plot_model as cls_plot
from pycaret.regression import setup as reg_setup, compare_models as reg_compare, save_model as reg_save, pull as reg_pull, plot_model as reg_plot
from pycaret.clustering import setup as clu_setup, create_model as clu_create, plot_model as clu_plot, save_model as clu_save, pull as clu_pull
from pycaret.anomaly import setup as ano_setup, create_model as ano_create, plot_model as ano_plot, save_model as ano_save, pull as ano_pull
from pycaret.time_series import setup as ts_setup, compare_models as ts_compare, save_model as ts_save, pull as ts_pull, plot_model as ts_plot
from pycaret.datasets import get_data
import streamlit.components.v1 as components
import traceback
from ydata_profiling import ProfileReport
import os
def get_all_datasets():
    """Return the dataset names listed in PyCaret's dataset index.

    Loads the index table via ``get_data('index')`` and returns the values
    of its ``Dataset`` column as a plain Python list.
    """
    dataset_index = get_data('index')
    dataset_names = dataset_index['Dataset']
    return dataset_names.to_list()
def show_profile_reports(container):
    """Embed previously generated EDA reports inside ``container``.

    Looks for ``profile_report.html`` and then ``sweetviz_report.html`` in
    the current working directory and renders every report that exists, in
    that order. Reports that are missing are skipped silently.

    Args:
        container: Streamlit container (used as a context manager) that
            receives the embedded HTML.
    """
    for report_path in ("profile_report.html", "sweetviz_report.html"):
        if not os.path.exists(report_path):
            continue
        # Decode explicitly as UTF-8 rather than the platform default
        # (e.g. cp1252 on Windows), which can fail on non-ASCII content.
        with open(report_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
        with container:
            components.html(html_content, height=800, scrolling=True)
def data_profile(df, container):
    """Generate a ydata-profiling report for ``df`` and embed it.

    The report is written to ``profile_report.html`` in the current working
    directory, read back, and rendered inside ``container``.

    Args:
        df: DataFrame to profile.
        container: Streamlit container (used as a context manager) that
            receives the embedded HTML.
    """
    profile = ProfileReport(df)
    profile.to_file("profile_report.html")
    # Decode explicitly as UTF-8 rather than the platform default encoding,
    # which can fail on non-ASCII report content.
    with open('profile_report.html', 'r', encoding='utf-8') as f:
        html_content = f.read()
    with container:
        components.html(html_content, height=800, scrolling=True)
def update_progress(progress_bar, step, max_steps):
    """Advance ``progress_bar`` to reflect ``step`` out of ``max_steps``.

    The percentage is clamped to the 0-100 range so that an out-of-range
    step never produces an invalid progress value, and a non-positive
    ``max_steps`` is treated as "already complete" instead of dividing by
    zero.

    Args:
        progress_bar: Object exposing ``progress(value, text=...)``.
        step: Number of the step that has just been reached.
        max_steps: Total number of steps in the process.
    """
    # Avoid ZeroDivisionError when there are no steps to perform.
    ratio = step / max_steps if max_steps > 0 else 1.0
    progress = min(100, max(0, int(ratio * 100)))
    if step >= max_steps:
        t = "Process Completed"
    else:
        t = f"Processing....Step {step}/{max_steps}"
    progress_bar.progress(progress, text=t)
def display_sweetviz_report(dataframe, container):
    """Generate a Sweetviz report for ``dataframe`` and embed it.

    The report is written to ``sweetviz_report.html`` in the current working
    directory (without opening a browser), read back, and rendered inside
    ``container``.

    Args:
        dataframe: DataFrame to analyze.
        container: Streamlit container (used as a context manager) that
            receives the embedded HTML.
    """
    report = sv.analyze(dataframe)
    report.show_html('sweetviz_report.html', open_browser=False)
    # Decode explicitly as UTF-8 rather than the platform default encoding,
    # which can fail on non-ASCII report content.
    with open('sweetviz_report.html', 'r', encoding='utf-8') as f:
        html_content = f.read()
    with container:
        components.html(html_content, height=800, scrolling=True)
def handle_exception(e):
    """Show a user-facing error for ``e`` plus an expandable traceback.

    Must be called while the exception is being handled (inside an
    ``except`` block) because the details section is filled from
    ``traceback.format_exc()``.

    Args:
        e: The exception whose message is shown in the error banner.
    """
    message = f"""The app has encountered an error:
**{e}**
Please check settings - columns selections and model parameters
Or
Create an issue [here](https://github.com/bitbotcoder/mlwiz/issues/new) with the below error details
"""
    st.error(message, icon="🥺")
    details = st.expander("See Error details")
    with details:
        st.error(traceback.format_exc())
def load_data(uploaded_file):
    """Read an uploaded CSV or XLSX file, preview it and store it.

    On success the first rows are written to the page and the full
    DataFrame is stored under ``st.session_state['dataframe']``. Any failure,
    including an unsupported file extension, is reported through
    ``handle_exception`` instead of propagating.

    Args:
        uploaded_file: File-like object with a ``name`` attribute that can
            be passed to ``pd.read_csv`` / ``pd.read_excel``.
    """
    try:
        # Compare case-insensitively so "DATA.CSV" is accepted as well.
        file_name = uploaded_file.name.lower()
        if file_name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        elif file_name.endswith('.xlsx'):
            df = pd.read_excel(uploaded_file)
        else:
            # Fail with a clear message rather than an UnboundLocalError on `df`.
            raise ValueError(
                f"Unsupported file type: '{uploaded_file.name}'. "
                "Please upload a .csv or .xlsx file."
            )
        st.write("## Dataset")
        st.write(df.head())
        st.session_state['dataframe'] = df
    except Exception as e:
        handle_exception(e)
def load_pycaret_dataset(dataset_name):
    """Load a PyCaret sample dataset, preview it and store it.

    The dataset is fetched with ``get_data``; its first rows are written to
    the page and the full DataFrame is stored under
    ``st.session_state['dataframe']``. Errors are routed to
    ``handle_exception``.

    Args:
        dataset_name: Name of the dataset to pass to ``get_data``.
    """
    try:
        dataset = get_data(dataset_name)
        st.write("## Dataset")
        preview = dataset.head()
        st.write(preview)
        st.session_state['dataframe'] = dataset
    except Exception as e:
        handle_exception(e)
def eda_report():
    """Render the EDA report controls and show reports on request.

    Does nothing when no DataFrame is stored in the session state.
    Otherwise draws a "Generate New" toggle, a "Show Report" button, a
    progress bar and a container. When the button is pressed, either fresh
    ydata-profiling and Sweetviz reports are generated and embedded (toggle
    on) or the previously saved report files are shown (toggle off).
    Errors are routed to ``handle_exception``.
    """
    if 'dataframe' not in st.session_state:
        return

    df = st.session_state['dataframe']
    toggle_col, button_col = st.columns([0.6, 0.4])
    new_report = toggle_col.toggle(":blue[Generate New]", value=True)
    show_button = button_col.button("Show Report")
    pb = st.progress(0, text="Generating Report")
    cont = st.container(border=False)

    try:
        if not show_button:
            return
        if not new_report:
            # Reuse the report files produced by an earlier run.
            show_profile_reports(cont)
            return
        update_progress(pb, 1, 4)
        data_profile(df, cont)
        update_progress(pb, 2, 4)
        display_sweetviz_report(df, cont)
        update_progress(pb, 4, 4)
    except Exception as e:
        handle_exception(e)
def build_model(task, container):
    """Render the model-building UI for ``task`` and train on demand.

    Does nothing when no DataFrame is stored in the session state.
    Otherwise draws the column-selection and parameter widgets inside
    ``container`` and, once the task's "Run ..." button is pressed, cleans a
    working copy of the selected data, runs the matching PyCaret workflow,
    displays results/plots and saves the model under
    ``best_<task>_model``. Errors are routed to ``handle_exception``.

    The DataFrame held in the session state is never modified: all type
    conversions and row drops are applied to a copy, and that same copy is
    what PyCaret receives.

    Args:
        task: One of "Classification", "Regression", "Clustering",
            "Anomaly Detection" or "Time Series Forecasting".
        container: Streamlit container that hosts the configuration
            expanders.
    """
    if 'dataframe' not in st.session_state:
        return
    df = st.session_state['dataframe']

    # --- Column selection -------------------------------------------------
    feature_expander = container.expander("Select Columns")
    supervised_tasks = ["Classification", "Regression", "Time Series Forecasting"]
    target_column = (
        feature_expander.selectbox("Select target column", df.columns)
        if task in supervised_tasks
        else None
    )
    numerical_columns = feature_expander.multiselect("Select numerical columns", df.columns)
    categorical_columns = feature_expander.multiselect("Select categorical columns", df.columns)

    # --- Preprocessing parameters ------------------------------------------
    params_expander = container.expander("Tune Parameters")
    # Data Preparation
    handle_missing_data = params_expander.toggle("Handle Missing Data", value=True)
    handle_outliers = params_expander.toggle("Handle Outliers", value=True)
    # Scale and Transform
    normalize = params_expander.checkbox("Normalize", value=False)
    normalize_method = (
        params_expander.selectbox("Normalize Method", ["zscore", "minmax", "maxabs", "robust"], index=0)
        if normalize
        else None
    )
    transformation = params_expander.checkbox("Apply Transformation", value=False)
    transformation_method = (
        params_expander.selectbox("Transformation Method", ["yeo-johnson", "quantile"], index=0)
        if transformation
        else None
    )
    # Feature Engineering
    polynomial_features = params_expander.checkbox("Polynomial Features", value=False)
    polynomial_degree = params_expander.slider("Polynomial Degree", 2, 5, 2) if polynomial_features else None
    # Feature Selection
    remove_multicollinearity = params_expander.checkbox("Remove Multicollinearity", value=False)
    multicollinearity_threshold = (
        params_expander.slider("Multicollinearity Threshold", 0.5, 1.0, 0.9)
        if remove_multicollinearity
        else None
    )
    if task not in ("Anomaly Detection", "Clustering"):
        feature_selection = params_expander.checkbox("Feature Selection", value=False)
        # NOTE(review): verify that "exhaustive" is a value accepted by the
        # installed PyCaret's `feature_selection_method` - TODO confirm.
        feature_selection_method = (
            params_expander.selectbox("Feature Selection Method", ["classic", "exhaustive"], index=0)
            if feature_selection
            else None
        )
    else:
        feature_selection = None
        feature_selection_method = None

    try:
        feature_columns = numerical_columns + categorical_columns
        # `is not None` (not truthiness) so a column labelled 0 still counts.
        selected_columns = feature_columns + ([target_column] if target_column is not None else [])
        # Work on a copy: the cleaning below is applied in place to `data`,
        # so it reaches PyCaret (via setup_kwargs) without touching the
        # DataFrame stored in the session state.
        data = df[selected_columns].copy()

        # Setup arguments for PyCaret
        setup_kwargs = {
            'data': data,
            'categorical_features': categorical_columns,
            'numeric_features': numerical_columns,
            'target': target_column,
            # The "Handle Missing Data" toggle is forwarded as `preprocess`.
            'preprocess': handle_missing_data,
            'remove_outliers': handle_outliers,
            'normalize': normalize,
            'normalize_method': normalize_method,
            'transformation': transformation,
            'transformation_method': transformation_method,
            'polynomial_features': polynomial_features,
            'polynomial_degree': polynomial_degree,
            'remove_multicollinearity': remove_multicollinearity,
            'multicollinearity_threshold': multicollinearity_threshold,
            'feature_selection': feature_selection,
            'feature_selection_method': feature_selection_method
        }
        pb = st.progress(0, text="Building Model...")

        if task == "Classification" and st.button("Run Classification"):
            data[target_column] = data[target_column].astype('category')
            data.dropna(subset=[target_column] + feature_columns, inplace=True)
            if len(data) < 2:
                st.error("Not enough data to split into train and test sets.")
                return
            update_progress(pb, 1, 7)
            cls_setup(**setup_kwargs)
            update_progress(pb, 2, 7)
            best_model = cls_compare()
            update_progress(pb, 3, 7)
            st.dataframe(cls_pull())
            update_progress(pb, 4, 7)
            cls_plot(best_model, plot='auc', display_format="streamlit")
            cls_plot(best_model, plot='confusion_matrix', display_format="streamlit")
            update_progress(pb, 5, 7)
            st.image(cls_plot(best_model, plot='pr', save=True))
            update_progress(pb, 6, 7)
            cls_save(best_model, 'best_classification_model')
            st.write('Best Model based on metrics - ')
            st.write(best_model)
            update_progress(pb, 7, 7)

        elif task == "Regression" and st.button("Run Regression"):
            update_progress(pb, 1, 7)
            # Non-numeric targets become NaN and are dropped just below.
            data[target_column] = pd.to_numeric(data[target_column], errors='coerce')
            update_progress(pb, 2, 7)
            data.dropna(subset=[target_column] + feature_columns, inplace=True)
            update_progress(pb, 3, 7)
            if len(data) < 2:
                st.error("Not enough data to split into train and test sets.")
                return
            reg_setup(**setup_kwargs)
            best_model = reg_compare()
            update_progress(pb, 4, 7)
            st.dataframe(reg_pull())
            update_progress(pb, 5, 7)
            # Each diagnostic plot is rendered once.
            st.image(reg_plot(best_model, plot='residuals', save=True))
            st.image(reg_plot(best_model, plot='error', save=True))
            update_progress(pb, 6, 7)
            reg_save(best_model, 'best_regression_model')
            st.write('Best Model based on metrics - ')
            st.write(best_model)
            update_progress(pb, 7, 7)

        elif task == "Clustering" and st.button("Run Clustering"):
            update_progress(pb, 1, 7)
            data.dropna(subset=feature_columns, inplace=True)
            update_progress(pb, 2, 7)
            # These arguments are not passed to the unsupervised setup.
            setup_kwargs.pop('target')
            setup_kwargs.pop('feature_selection')
            setup_kwargs.pop('feature_selection_method')
            update_progress(pb, 3, 7)
            clu_setup(**setup_kwargs)
            best_model = clu_create('kmeans')
            update_progress(pb, 4, 7)
            clu_plot(best_model, plot='cluster', display_format='streamlit')
            clu_plot(best_model, plot='elbow', display_format='streamlit')
            update_progress(pb, 5, 7)
            st.write(best_model)
            st.dataframe(clu_pull())
            update_progress(pb, 6, 7)
            clu_save(best_model, 'best_clustering_model')
            st.write('Best Model based on metrics - ')
            st.write(best_model)
            update_progress(pb, 7, 7)

        elif task == "Anomaly Detection" and st.button("Run Anomaly Detection"):
            update_progress(pb, 1, 7)
            data.dropna(subset=feature_columns, inplace=True)
            update_progress(pb, 2, 7)
            # These arguments are not passed to the unsupervised setup.
            setup_kwargs.pop('target')
            setup_kwargs.pop('feature_selection')
            setup_kwargs.pop('feature_selection_method')
            update_progress(pb, 3, 7)
            ano_setup(**setup_kwargs)
            best_model = ano_create('iforest')
            update_progress(pb, 4, 7)
            ano_plot(best_model, plot='tsne', display_format='streamlit')
            update_progress(pb, 5, 7)
            st.write(best_model)
            st.dataframe(ano_pull())
            update_progress(pb, 6, 7)
            ano_save(best_model, 'best_anomaly_model')
            st.write('Best Model based on metrics - ')
            st.write(best_model)
            update_progress(pb, 7, 7)

        elif task == "Time Series Forecasting":
            date_column = feature_expander.selectbox("Select date column", df.columns)
            if st.button("Run Time Series Forecasting"):
                update_progress(pb, 1, 5)
                # The forecasting workflow uses every column, so clean a
                # full copy rather than the feature subset.
                ts_data = df.copy()
                ts_data[date_column] = pd.to_datetime(ts_data[date_column])
                ts_data[target_column] = pd.to_numeric(ts_data[target_column], errors='coerce')
                ts_data.dropna(subset=[target_column], inplace=True)
                update_progress(pb, 2, 5)
                # Re-index on the date column at daily frequency; gaps
                # become NaN rows that the mean imputation below fills.
                ts_data = ts_data.set_index(date_column).asfreq('D')
                ts_setup(
                    ts_data,
                    target=target_column,
                    numeric_imputation_target='mean',
                    numeric_imputation_exogenous='mean',
                )
                best_model = ts_compare()
                update_progress(pb, 3, 5)
                st.dataframe(ts_pull())
                ts_plot(best_model, plot='forecast', display_format="streamlit")
                ts_save(best_model, 'best_timeseries_model')
                update_progress(pb, 4, 5)
                st.write('Best Model based on metrics - ')
                st.write(best_model)
                update_progress(pb, 5, 5)
    except Exception as e:
        handle_exception(e)
def download_model(task):
    """Offer the saved model for ``task`` as a download.

    Maps the task name to its ``best_*_model.pkl`` file. An unrecognised
    task does nothing; a recognised task whose file does not exist shows an
    error; otherwise a download button is rendered for the file. Failures
    while opening the file or creating the button are routed to
    ``handle_exception``.

    Args:
        task: Task name, e.g. "Classification" or "Regression".
    """
    model_files = {
        "Classification": 'best_classification_model.pkl',
        "Regression": 'best_regression_model.pkl',
        "Clustering": 'best_clustering_model.pkl',
        "Anomaly Detection": 'best_anomaly_model.pkl',
        "Time Series Forecasting": 'best_timeseries_model.pkl',
    }
    model_file = model_files.get(task)
    if not model_file:
        return

    if not os.path.exists(model_file):
        st.error("❗No File Found | First Build A ML Model ")
        return

    try:
        with open(model_file, 'rb') as f:
            st.download_button('Download Model', f, file_name=model_file)
    except Exception as e:
        handle_exception(e)