Spaces:

bitbotcoder
/

automl

Paused

automl / ml_pipeline.py

bitbotcoder

alpha1

7f45a59 12 months ago

14.5 kB

	import streamlit as st
	import pandas as pd
	import sweetviz as sv
	from pycaret.classification import setup as cls_setup, compare_models as cls_compare, save_model as cls_save, pull as cls_pull, plot_model as cls_plot
	from pycaret.regression import setup as reg_setup, compare_models as reg_compare, save_model as reg_save, pull as reg_pull, plot_model as reg_plot
	from pycaret.clustering import setup as clu_setup, create_model as clu_create, plot_model as clu_plot, save_model as clu_save, pull as clu_pull
	from pycaret.anomaly import setup as ano_setup, create_model as ano_create, plot_model as ano_plot, save_model as ano_save, pull as ano_pull
	from pycaret.time_series import setup as ts_setup, compare_models as ts_compare, save_model as ts_save, pull as ts_pull, plot_model as ts_plot
	from pycaret.datasets import get_data
	import streamlit.components.v1 as components
	import traceback
	from ydata_profiling import ProfileReport
	import os



	def get_all_datasets():
	    """Return the values of the 'Dataset' column of PyCaret's 'index' dataset as a list."""
	    dataset_index = get_data('index')
	    dataset_names = dataset_index['Dataset'].to_list()
	    return dataset_names

	def show_profile_reports(container):
	    """Embed previously generated EDA reports inside *container*.

	    Looks for ``profile_report.html`` and ``sweetviz_report.html`` in the
	    current working directory, in that order, and renders each one that
	    exists. Missing files are skipped silently.

	    Args:
	        container: Context manager (Streamlit container) the HTML is rendered in.
	    """
	    for report_path in ("profile_report.html", "sweetviz_report.html"):
	        if not os.path.exists(report_path):
	            continue
	        # Decode explicitly as UTF-8 instead of the platform default so the
	        # report reads the same on every OS/locale.
	        with open(report_path, 'r', encoding='utf-8') as f:
	            html_content = f.read()
	        with container:
	            components.html(html_content, height=800, scrolling=True)

	def data_profile(df,container):
	    """Generate a ydata-profiling report for *df* and embed it in *container*.

	    The report is written to ``profile_report.html`` in the current working
	    directory and then read back and rendered as an HTML component.

	    Args:
	        df: Dataframe to profile.
	        container: Context manager (Streamlit container) the HTML is rendered in.
	    """
	    profile = ProfileReport(df)
	    profile.to_file("profile_report.html")
	    # Decode explicitly as UTF-8 instead of the platform default so the
	    # report reads the same on every OS/locale.
	    with open('profile_report.html', 'r', encoding='utf-8') as f:
	        html_content = f.read()
	    with container:
	        components.html(html_content, height=800, scrolling=True)

	def update_progress(progress_bar, step, max_steps):
	    """Move *progress_bar* to ``step / max_steps`` and update its caption.

	    The percentage is truncated to an integer. The caption reads
	    "Process Completed" on the final step and a step counter otherwise.
	    """
	    percent = int((step / max_steps) * 100)
	    if step == max_steps:
	        caption = "Process Completed"
	    else:
	        caption = f"Processing....Step {step}/{max_steps}"
	    progress_bar.progress(percent, text=caption)

	def display_sweetviz_report(dataframe,container):
	    """Generate a Sweetviz report for *dataframe* and embed it in *container*.

	    The report is written to ``sweetviz_report.html`` in the current working
	    directory (without opening a browser) and then read back and rendered as
	    an HTML component.

	    Args:
	        dataframe: Dataframe to analyze.
	        container: Context manager (Streamlit container) the HTML is rendered in.
	    """
	    report = sv.analyze(dataframe)
	    report.show_html('sweetviz_report.html', open_browser=False)
	    # Decode explicitly as UTF-8 instead of the platform default so the
	    # report reads the same on every OS/locale.
	    with open('sweetviz_report.html', 'r', encoding='utf-8') as f:
	        html_content = f.read()
	    with container:
	        components.html(html_content, height=800, scrolling=True)

	def handle_exception(e):
	    """Show *e* as a Streamlit error plus an expander holding the current traceback."""
	    message = f"""The app has encountered an error:
	{e}
	Please check settings - columns selections and model parameters
	Or
	Create an issue [here](https://github.com/bitbotcoder/mlwiz/issues/new) with the below error details
	"""
	    st.error(message, icon="🥺")
	    details = st.expander("See Error details")
	    with details:
	        # format_exc() reports the exception currently being handled, so this
	        # is meant to be called from inside an ``except`` block.
	        st.error(traceback.format_exc())

	def load_data(uploaded_file):
	    """Read an uploaded CSV/XLSX file into the session dataframe and preview it.

	    On success the dataframe is stored in ``st.session_state['dataframe']``
	    and its head is displayed. Files with any other extension are rejected
	    with an explicit error message. Read failures are reported through
	    ``handle_exception``.

	    Args:
	        uploaded_file: Uploaded file object exposing a ``name`` attribute.
	    """
	    try:
	        # Match the extension case-insensitively so "DATA.CSV" is accepted too.
	        file_name = uploaded_file.name.lower()
	        if file_name.endswith('.csv'):
	            df = pd.read_csv(uploaded_file)
	        elif file_name.endswith('.xlsx'):
	            df = pd.read_excel(uploaded_file)
	        else:
	            # Without this branch ``df`` would be unbound below and the user
	            # would only see an unrelated UnboundLocalError.
	            st.error("Unsupported file type. Please upload a .csv or .xlsx file.")
	            return
	        st.write("## Dataset")
	        st.write(df.head())
	        st.session_state['dataframe'] = df
	    except Exception as e:
	        handle_exception(e)

	def load_pycaret_dataset(dataset_name):
	    """Load the PyCaret dataset *dataset_name*, preview it and store it in the session.

	    The dataframe is saved under ``st.session_state['dataframe']``. Any
	    failure is reported through ``handle_exception``.
	    """
	    try:
	        dataset = get_data(dataset_name)
	        st.write("## Dataset")
	        preview = dataset.head()
	        st.write(preview)
	        st.session_state['dataframe'] = dataset
	    except Exception as err:
	        handle_exception(err)


	def eda_report():
	    """Draw the EDA controls and, on request, show the profiling reports.

	    Does nothing when no dataframe is stored in the session. With the
	    "Generate New" toggle on, both reports are regenerated from the session
	    dataframe. With it off, the report files already on disk are displayed.
	    Errors are reported through ``handle_exception``.
	    """
	    if 'dataframe' not in st.session_state:
	        return

	    dataset = st.session_state['dataframe']
	    toggle_col, button_col = st.columns([0.6,0.4])
	    regenerate = toggle_col.toggle(":blue[Generate New]", value=True)
	    show_requested = button_col.button("Show Report")
	    progress_bar = st.progress(0, text="Generating Report")
	    report_area = st.container(border=False)

	    try:
	        if not show_requested:
	            return
	        if not regenerate:
	            # Re-display whatever report files are already on disk.
	            show_profile_reports(report_area)
	            return
	        update_progress(progress_bar, 1, 4)
	        data_profile(dataset, report_area)
	        update_progress(progress_bar, 2, 4)
	        display_sweetviz_report(dataset, report_area)
	        update_progress(progress_bar, 4, 4)
	    except Exception as err:
	        handle_exception(err)


	def build_model(task, container):
	    """Render the model-building controls for *task* and train a PyCaret model.

	    Does nothing unless ``st.session_state['dataframe']`` is set. Column and
	    preprocessing widgets are drawn inside *container*. Pressing the task's
	    "Run ..." button runs the matching PyCaret ``setup``, trains or compares
	    models, shows the result table and plots, and saves the model under a
	    ``best_<task>_model`` name. Exceptions raised while building are reported
	    through ``handle_exception``.

	    Args:
	        task: "Classification", "Regression", "Clustering",
	            "Anomaly Detection" or "Time Series Forecasting".
	        container: Streamlit container hosting the two expanders.
	    """
	    if 'dataframe' not in st.session_state:
	        return

	    df = st.session_state['dataframe']
	    needs_target = task in ["Classification", "Regression", "Time Series Forecasting"]

	    feature_expander = container.expander("Select Columns")
	    target_column = feature_expander.selectbox("Select target column", df.columns) if needs_target else None
	    numerical_columns = feature_expander.multiselect("Select numerical columns", df.columns)
	    categorical_columns = feature_expander.multiselect("Select categorical columns", df.columns)

	    params_expander = container.expander("Tune Parameters")
	    # Data Preparation
	    handle_missing_data = params_expander.toggle("Handle Missing Data", value=True)
	    handle_outliers = params_expander.toggle("Handle Outliers", value=True)

	    # Scale and Transform
	    normalize = params_expander.checkbox("Normalize", value=False)
	    normalize_method = params_expander.selectbox("Normalize Method", ["zscore", "minmax", "maxabs", "robust"], index=0) if normalize else None
	    transformation = params_expander.checkbox("Apply Transformation", value=False)
	    transformation_method = params_expander.selectbox("Transformation Method", ["yeo-johnson", "quantile"], index=0) if transformation else None

	    # Feature Engineering
	    polynomial_features = params_expander.checkbox("Polynomial Features", value=False)
	    polynomial_degree = params_expander.slider("Polynomial Degree", 2, 5, 2) if polynomial_features else None

	    # Feature Selection
	    remove_multicollinearity = params_expander.checkbox("Remove Multicollinearity", value=False)
	    multicollinearity_threshold = params_expander.slider("Multicollinearity Threshold", 0.5, 1.0, 0.9) if remove_multicollinearity else None

	    if task in ("Anomaly Detection", "Clustering"):
	        # Feature selection needs a target, so it is not offered here.
	        feature_selection = None
	        feature_selection_method = None
	    else:
	        feature_selection = params_expander.checkbox("Feature Selection", value=False)
	        feature_selection_method = params_expander.selectbox("Feature Selection Method", ["classic", "exhaustive"], index=0) if feature_selection else None

	    try:
	        feature_columns = numerical_columns + categorical_columns
	        # ``is not None`` keeps a target whose label is falsy (e.g. column 0).
	        model_columns = feature_columns + ([target_column] if target_column is not None else [])
	        # Clean a private copy: the cleaning below must reach PyCaret's setup,
	        # and must never alter the dataframe kept in the session.
	        # dict.fromkeys drops repeated column picks while keeping their order.
	        data = df[list(dict.fromkeys(model_columns))].copy()

	        # Setup arguments for PyCaret
	        setup_kwargs = {
	            'data': data,
	            'categorical_features': categorical_columns,
	            'numeric_features': numerical_columns,
	            'target': target_column,
	            'preprocess': handle_missing_data,
	            'remove_outliers': handle_outliers,
	            'normalize': normalize,
	            'normalize_method': normalize_method,
	            'transformation': transformation,
	            'transformation_method': transformation_method,
	            'polynomial_features': polynomial_features,
	            'polynomial_degree': polynomial_degree,
	            'remove_multicollinearity': remove_multicollinearity,
	            'multicollinearity_threshold': multicollinearity_threshold,
	            'feature_selection': feature_selection,
	            'feature_selection_method': feature_selection_method
	        }
	        # Keys the unsupervised setups are not given.
	        supervised_only_keys = ('target', 'feature_selection', 'feature_selection_method')
	        pb = st.progress(0, text="Building Model...")

	        if task == "Classification" and st.button("Run Classification"):
	            data[target_column] = data[target_column].astype('category')
	            data.dropna(subset=model_columns, inplace=True)

	            if len(data) < 2:
	                st.error("Not enough data to split into train and test sets.")
	                return
	            update_progress(pb,1,7)
	            cls_setup(**setup_kwargs)
	            update_progress(pb,2,7)
	            best_model = cls_compare()
	            update_progress(pb,3,7)
	            st.dataframe(cls_pull())
	            update_progress(pb,4,7)
	            cls_plot(best_model, plot='auc',display_format="streamlit")
	            cls_plot(best_model, plot='confusion_matrix',display_format="streamlit")
	            update_progress(pb,5,7)
	            st.image(cls_plot(best_model, plot='pr',save=True))
	            update_progress(pb,6,7)
	            cls_save(best_model, 'best_classification_model')
	            st.write('Best Model based on metrics - ')
	            st.write(best_model)
	            update_progress(pb,7,7)

	        elif task == "Regression" and st.button("Run Regression"):
	            update_progress(pb,1,7)
	            # Non-numeric targets become NaN and are dropped just below.
	            data[target_column] = pd.to_numeric(data[target_column], errors='coerce')
	            update_progress(pb,2,7)
	            data.dropna(subset=model_columns, inplace=True)
	            update_progress(pb,3,7)
	            if len(data) < 2:
	                st.error("Not enough data to split into train and test sets.")
	                return

	            reg_setup(**setup_kwargs)
	            best_model = reg_compare()
	            update_progress(pb,4,7)
	            st.dataframe(reg_pull())
	            update_progress(pb,5,7)
	            st.image(reg_plot(best_model, plot='residuals', save=True))
	            st.image(reg_plot(best_model, plot='error', save=True))
	            update_progress(pb,6,7)
	            reg_save(best_model, 'best_regression_model')
	            st.write('Best Model based on metrics - ')
	            st.write(best_model)
	            update_progress(pb,7,7)

	        elif task == "Clustering" and st.button("Run Clustering"):
	            update_progress(pb,1,7)
	            data.dropna(subset=feature_columns, inplace=True)
	            update_progress(pb,2,7)
	            for key in supervised_only_keys:
	                setup_kwargs.pop(key)
	            update_progress(pb,3,7)
	            clu_setup(**setup_kwargs)
	            best_model = clu_create('kmeans')
	            update_progress(pb,4,7)
	            clu_plot(best_model, plot='cluster', display_format='streamlit')
	            clu_plot(best_model, plot='elbow', display_format='streamlit')
	            update_progress(pb,5,7)
	            st.write(best_model)
	            st.dataframe(clu_pull())
	            update_progress(pb,6,7)
	            clu_save(best_model, 'best_clustering_model')
	            st.write('Best Model based on metrics - ')
	            st.write(best_model)
	            update_progress(pb,7,7)

	        elif task == "Anomaly Detection" and st.button("Run Anomaly Detection"):
	            update_progress(pb,1,7)
	            data.dropna(subset=feature_columns, inplace=True)
	            update_progress(pb,2,7)
	            for key in supervised_only_keys:
	                setup_kwargs.pop(key)
	            update_progress(pb,3,7)
	            ano_setup(**setup_kwargs)
	            best_model = ano_create('iforest')
	            update_progress(pb,4,7)
	            ano_plot(best_model, plot='tsne', display_format='streamlit')
	            update_progress(pb,5,7)
	            st.write(best_model)
	            st.dataframe(ano_pull())
	            update_progress(pb,6,7)
	            ano_save(best_model, 'best_anomaly_model')
	            st.write('Best Model based on metrics - ')
	            st.write(best_model)
	            update_progress(pb,7,7)

	        elif task == "Time Series Forecasting":
	            date_column = feature_expander.selectbox("Select date column", df.columns)
	            if st.button("Run Time Series Forecasting"):
	                update_progress(pb,1,5)
	                # The forecasting setup receives every column, so copy the
	                # whole frame rather than the column selection above.
	                ts_data = df.copy()
	                ts_data[date_column] = pd.to_datetime(ts_data[date_column])
	                ts_data[target_column] = pd.to_numeric(ts_data[target_column], errors='coerce')
	                ts_data.dropna(subset=[target_column], inplace=True)
	                update_progress(pb,2,5)
	                # Reindex to daily frequency.
	                ts_data = ts_data.set_index(date_column).asfreq('D')
	                ts_setup(ts_data, target=target_column, numeric_imputation_target='mean', numeric_imputation_exogenous='mean')
	                best_model = ts_compare()
	                update_progress(pb,3,5)
	                st.dataframe(ts_pull())
	                ts_plot(best_model, plot='forecast', display_format="streamlit")
	                ts_save(best_model, 'best_timeseries_model')
	                update_progress(pb,4,5)
	                st.write('Best Model based on metrics - ')
	                st.write(best_model)
	                update_progress(pb,5,5)
	    except Exception as e:
	        handle_exception(e)

	def download_model(task):
	    """Offer the saved model file for *task* as a Streamlit download.

	    Unknown tasks are ignored. If the expected ``.pkl`` file is not in the
	    current working directory an error message is shown instead. Failures
	    while opening or serving the file go through ``handle_exception``.

	    Args:
	        task: "Classification", "Regression", "Clustering",
	            "Anomaly Detection" or "Time Series Forecasting".
	    """
	    model_files = {
	        "Classification": 'best_classification_model.pkl',
	        "Regression": 'best_regression_model.pkl',
	        "Clustering": 'best_clustering_model.pkl',
	        "Anomaly Detection": 'best_anomaly_model.pkl',
	        "Time Series Forecasting": 'best_timeseries_model.pkl',
	    }
	    model_file = model_files.get(task)
	    if not model_file:
	        return

	    if not os.path.exists(model_file):
	        # "\|" is not a valid Python escape sequence. The backslash is written
	        # explicitly so the emitted text is unchanged and no warning is raised.
	        st.error("❗No File Found \\| First Build A ML Model ")
	        return

	    try:
	        with open(model_file, 'rb') as f:
	            st.download_button('Download Model', f, file_name=model_file)
	    except Exception as e:
	        handle_exception(e)