|
""" |
|
Creators: Diego Medeiros e Reyne Jasson |
|
create a pipeline for building a logistic regression model |
|
and study how does the corona virus changed the sucess |
|
on school. |
|
""" |
|
|
|
|
|
|
|
from sklearn.model_selection import train_test_split |
|
|
|
from sklearn.linear_model import LogisticRegression |
|
|
|
from sklearn.preprocessing import LabelEncoder |
|
from sklearn.preprocessing import OneHotEncoder |
|
from sklearn.preprocessing import MinMaxScaler |
|
from sklearn.preprocessing import StandardScaler |
|
|
|
from sklearn.metrics import confusion_matrix |
|
|
|
from sklearn.pipeline import Pipeline, FeatureUnion |
|
|
|
|
|
from sklearn.neighbors import LocalOutlierFactor |
|
from sklearn.base import BaseEstimator, TransformerMixin |
|
|
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
|
|
from sklearn.impute import SimpleImputer |
|
import pandas as pd |
|
import numpy as np |
|
|
|
from joblib import dump |
|
|
|
import argparse |
|
|
|
from clearml import Task |
|
|
|
|
|
|
|
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Select a fixed subset of columns from a pandas DataFrame.

    Used as the first step of each pipeline branch so the categorical
    and numerical branches only see their own columns.
    """

    def __init__(self, feature_names):
        # Column names that transform() will keep.
        self.feature_names = feature_names

    def fit(self, X, y=None):
        # Stateless: nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        # Return only the configured columns (plain DataFrame indexing).
        return X[self.feature_names]
|
|
|
|
|
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    """Encode the categorical features of the dataset.

    Maps the ordinal column ``grau_academico`` (academic degree) to
    string codes '3' (highest) .. '0' (other); every other column is
    passed through unchanged.
    """

    def __init__(self, new_features=True, colnames=None):
        # new_features: kept for interface compatibility (currently unused).
        self.new_features = new_features
        # colnames: column names of the array handed over by the pipeline.
        self.colnames = colnames

    def fit(self, X, y=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def get_feature_names(self):
        # NOTE(review): assumes colnames is an ndarray/Index (has .tolist())
        # — confirm against how the pipeline constructs this transformer.
        return self.colnames.tolist()

    def transform(self, X, y=None):
        """Rebuild a DataFrame from X and ordinally encode 'grau_academico'.

        Returns the DataFrame with 'grau_academico' replaced by string
        codes; other columns are untouched.
        """
        df = pd.DataFrame(X, columns=self.colnames)

        # Ordinal encoding: BACHARELADO > LICENCIATURA > TECNOLÓGICO > OUTRO.
        # Plain assignment instead of chained inplace replace: identical
        # result, but avoids the pandas chained-assignment FutureWarning.
        df['grau_academico'] = df['grau_academico'].replace(
            {'BACHARELADO': '3', 'LICENCIATURA': '2',
             'TECNOLÓGICO': '1', "OUTRO": "0"})

        print(df.head())

        return df
|
|
|
class NumericalTransformer(BaseEstimator, TransformerMixin):
    """Clean and scale the numerical features.

    Coerces every configured column to numeric, derives an ``idade``
    (age) column from ``ano_nascimento`` (birth year), and optionally
    scales the result.

    model selects the scaling strategy:
        0 -> MinMaxScaler, 1 -> StandardScaler, anything else -> raw values.
    """

    def __init__(self, model=0, colnames=None, reference_year=2020):
        self.model = model
        self.colnames = colnames
        # Year used to compute age; the default (2020) preserves the
        # original hard-coded behavior.
        self.reference_year = reference_year

    def fit(self, X, y=None):
        # Stateless: scaling is (re-)fit inside transform, see NOTE below.
        return self

    def get_feature_names(self):
        return self.colnames

    def transform(self, X, y=None):
        """Return the cleaned/scaled numerical features as an ndarray."""
        df = pd.DataFrame(X, columns=self.colnames)

        # Upstream steps hand over raw object arrays, so coerce each
        # column to numeric; unparseable values become NaN, filled with 0.
        for coluna in self.colnames:
            df[coluna] = pd.to_numeric(df[coluna], errors='coerce')

        df.fillna(value=0, inplace=True)
        # Refresh colnames from the frame. 'idade' is added AFTER this
        # line, so get_feature_names() does not report it (original
        # behavior preserved).
        self.colnames = df.columns.tolist()

        df['idade'] = self.reference_year - df['ano_nascimento'].astype(int)

        # NOTE(review): the scaler is re-fit on every transform() call, so
        # train and validation data are scaled independently — confirm
        # this is intended (fitted scaler state is never reused).
        if self.model == 0:
            scaler = MinMaxScaler()
            df = scaler.fit_transform(df)
        elif self.model == 1:
            scaler = StandardScaler()
            df = scaler.fit_transform(df)
        else:
            df = df.values

        return df
|
|
|
def process_args(ARGS: argparse.Namespace, task: Task):
    """Load the preprocessed dataset, train the pipeline, return results.

    Fetches the CSV artifact produced by the preprocessing task referenced
    by ``ARGS.task_id``, cleans/encodes it, removes outliers, builds the
    full preprocessing + LogisticRegression pipeline and fits it.

    Returns:
        (pipe, x_val, y_val): the fitted sklearn Pipeline and the
        held-out validation features/labels.
    """
    # NOTE(review): logger is created but never used below — confirm
    # whether explicit ClearML logging was intended here.
    logger = task.get_logger()

    # Fetch the CSV artifact from the preprocessing task.
    preprocessed_data_task = Task.get_task(task_id=ARGS.task_id)
    local_csv = preprocessed_data_task.artifacts[ARGS.dataset_name].get_local_copy()

    # Read everything as object so the transformers control the casts.
    data = pd.read_csv(local_csv, encoding='utf-8', sep=',', dtype=object)

    # Basic cleaning: default birth year, then numeric casts.
    # Plain assignments instead of chained inplace fillna: identical
    # result, avoids the pandas chained-assignment FutureWarning.
    data['ano_nascimento'] = data['ano_nascimento'].fillna(value='2000')
    data['ano_nascimento'] = data['ano_nascimento'].astype(int)
    data['renda'] = data['renda'].astype(int)

    print("Spliting data into train/val")

    # Flag whether the student comes from the local state.
    data['local_ou_de_fora'] = (data['estado_origem'] == ('Rio Grande do Norte'))

    # Fill categorical gaps with explicit "unknown" labels.
    data['raca'] = data['raca'].fillna(value='Não Informado')
    data['area_conhecimento'] = data['area_conhecimento'].fillna(value='Outra')
    data['grau_academico'] = data['grau_academico'].fillna(value='OUTRO')

    data.drop(columns={'estado_origem', 'cidade_origem'}, inplace=True)

    # Target encoding: approved -> 1, failed -> 0.
    data['descricao'] = data['descricao'].replace(
        {'APROVADO': '1', "FALHOU": "0", "REPROVADO POR NOTA E FALTA": "0"})
    data['descricao'] = pd.to_numeric(data['descricao'], errors='coerce')
    # BUG FIX: fill with the NUMBER 0, not the string '0'. The string
    # produced a mixed int/str column, which breaks LabelEncoder (mixed
    # types are not orderable) and corrupts stratified splitting.
    data['descricao'] = data['descricao'].fillna(value=0)

    print(data.dtypes)

    x_train, x_val, y_train, y_val = train_test_split(
        data.drop(columns=['descricao']),
        data['descricao'],
        test_size=0.2,
        random_state=2,
        shuffle=True,
        stratify=data['descricao'] if ARGS.stratify else None)

    print("x train: {}".format(x_train.shape))
    print("y train: {}".format(y_train.shape))
    print("x val: {}".format(x_val.shape))
    print("y val: {}".format(y_val.shape))
    print("x train: {}".format(list(x_train.columns)))
    print("Removal Outliers")

    # Outlier removal on the integer features only (LOF needs numerics).
    x = x_train.select_dtypes("int64").copy()

    lof = LocalOutlierFactor()
    outlier = lof.fit_predict(x)
    # fit_predict returns -1 for outliers; keep only inliers.
    mask = (outlier != -1)

    print("x_train shape [original]: {}".format(x_train.shape))
    print("x_train shape [outlier removal]: {}".format(x_train.loc[mask, :].shape))

    x_train = x_train.loc[mask, :].copy()
    y_train = y_train[mask].copy()

    print("Encoding Target Variable")
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_val = le.transform(y_val)
    print(y_train)
    print("Classes [0, 1]: {}".format(le.inverse_transform([0, 1])))

    print("Pipeline generation")

    # Split the feature columns by dtype for the two pipeline branches.
    categorical_features = x_train.select_dtypes(["object", 'bool']).columns.to_list()
    numerical_features = x_train.select_dtypes("int64").columns.to_list()

    categorical_pipeline = Pipeline(steps=[
        ('cat_selector', FeatureSelector(categorical_features)),
        ('imputer_cat', SimpleImputer(strategy="most_frequent")),
        # NOTE(review): `sparse=` was renamed `sparse_output=` in
        # sklearn >= 1.2 and removed in 1.4; kept as-is for the pinned
        # version — confirm before upgrading sklearn.
        ('cat_encoder', OneHotEncoder(sparse=False, drop="first")),
    ])

    print(FeatureSelector(numerical_features))

    numerical_pipeline = Pipeline(steps=[
        ('num_selector', FeatureSelector(numerical_features)),
        ('imputer_cat', SimpleImputer(strategy="median")),
        ('num_transformer', NumericalTransformer(colnames=numerical_features)),
    ])

    # Run both branches side by side and concatenate their outputs.
    full_pipeline_preprocessing = FeatureUnion(transformer_list=[
        ('cat_pipeline', categorical_pipeline),
        ('num_pipeline', numerical_pipeline),
    ])

    pipe = Pipeline(steps=[
        ('full_pipeline', full_pipeline_preprocessing),
        ("classifier", LogisticRegression()),
    ])

    print("Training{}".format(list(x_train.dtypes)))
    pipe.fit(x_train, y_train)

    print("Infering")
    predict = pipe.predict(x_val)

    print(predict)

    return pipe, x_val, y_val
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
|
|
parser = argparse.ArgumentParser( |
|
description="The training script", |
|
fromfile_prefix_chars="@", |
|
) |
|
|
|
parser.add_argument( |
|
"--model_export", |
|
type=str, |
|
help="Fully-qualified artifact name for the exported model to clearML", |
|
default='regressao_logistica.joblib' |
|
) |
|
|
|
parser.add_argument( |
|
"--dataset_name", |
|
type=str, |
|
default='processed_data', |
|
help="The dataset name to generate model" |
|
|
|
) |
|
|
|
parser.add_argument( |
|
"--task_id", |
|
type=str, |
|
help="Task ID where the data was generated", |
|
default='71845909e9b643fca92e5902c32265a1' |
|
) |
|
|
|
|
|
parser.add_argument( |
|
"--stratify", |
|
type=int, |
|
help="Name for column which to stratify", |
|
default=None |
|
) |
|
|
|
ARGS = parser.parse_args() |
|
|
|
task = Task.init(project_name="a ML example",task_name="logist training") |
|
|
|
|
|
|
|
|
|
clf,x_val,y_val = process_args(ARGS,task) |
|
|
|
y_predict = clf.predict(x_val) |
|
|
|
|
|
cm = confusion_matrix(y_true=y_val,y_pred=y_predict,normalize='true') |
|
cmap = sns.diverging_palette(10, 240, as_cmap=True) |
|
sns.heatmap(cm,cmap=cmap, annot=True) |
|
plt.show() |
|
|
|
print(f"Exporting model {ARGS.model_export}") |
|
dump(clf, ARGS.model_export) |
|
task.upload_artifact("log_regress_classifier", ARGS.model_export) |
|
|
|
|