| import pandas as pd |
| import numpy as np |
| |
| from sklearn.feature_selection import mutual_info_classif,chi2 |
| from sklearn.feature_selection import SelectKBest, SelectPercentile |
| from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor |
| from sklearn.metrics import roc_auc_score, mean_squared_error |
|
|
| |
|
|
def constant_feature_detect(data, threshold=0.98):
    """Detect features that show the same value for the
    majority/all of the observations (constant/quasi-constant features).

    Parameters
    ----------
    data : pd.DataFrame
    threshold : float
        Fraction of rows sharing one value above/equal which the
        feature is flagged as (quasi-)constant.

    Returns
    -------
    list
        Names of the (quasi-)constant variables.
    """
    data_copy = data.copy(deep=True)

    # Guard: with zero rows no predominance can be computed.
    if len(data_copy) == 0:
        print(0, ' variables are found to be almost constant')
        return []

    quasi_constant_feature = []
    for feature in data_copy.columns:
        # value_counts(normalize=True) replaces the manual division by
        # np.float(len(...)): np.float was removed in NumPy 1.24 and the
        # old code raises AttributeError there.  value_counts already
        # sorts descending, so the first entry is the predominant share.
        predominant = data_copy[feature].value_counts(normalize=True).iloc[0]
        if predominant >= threshold:
            quasi_constant_feature.append(feature)
    print(len(quasi_constant_feature), ' variables are found to be almost constant')
    return quasi_constant_feature
|
|
|
|
def corr_feature_detect(data, threshold=0.8):
    """Detect groups of highly-correlated features of a DataFrame.

    Parameters
    ----------
    data : pd.DataFrame
    threshold : float
        Absolute correlation above/equal which two features are
        considered correlated.

    Returns
    -------
    list of pd.DataFrame
        One DataFrame per correlated group, with columns
        ['feature1', 'feature2', 'corr'].
    """
    corrmat = data.corr()
    corrmat = corrmat.abs().unstack()
    corrmat = corrmat.sort_values(ascending=False)
    corrmat = corrmat[corrmat >= threshold]
    corrmat = pd.DataFrame(corrmat).reset_index()
    corrmat.columns = ['feature1', 'feature2', 'corr']
    # Bug fix: the old filter `corrmat < 1` was meant to drop the diagonal
    # (self-correlation) but also dropped perfectly correlated feature
    # PAIRS (|corr| == 1.0) — the strongest case this function should
    # report.  Drop self-pairs by name instead.
    corrmat = corrmat[corrmat.feature1 != corrmat.feature2]

    grouped_feature_ls = []
    correlated_groups = []
    for feature in corrmat.feature1.unique():
        if feature not in grouped_feature_ls:
            # All partners of this feature form one group; mark both the
            # feature and its partners as already grouped.
            correlated_block = corrmat[corrmat.feature1 == feature]
            grouped_feature_ls = grouped_feature_ls + list(
                correlated_block.feature2.unique()) + [feature]
            correlated_groups.append(correlated_block)
    return correlated_groups
|
|
|
|
def mutual_info(X, y, select_k=10):
    """Select features by mutual information with the target.

    ``select_k >= 1`` is interpreted as an absolute number of features
    (SelectKBest); ``0 < select_k < 1`` as a fraction of features
    (SelectPercentile).  Any other value raises ValueError.

    Returns the pandas Index of the selected column names.
    """
    if select_k >= 1:
        selector = SelectKBest(mutual_info_classif, k=select_k)
    elif 0 < select_k < 1:
        selector = SelectPercentile(mutual_info_classif, percentile=select_k * 100)
    else:
        raise ValueError("select_k must be a positive number")

    selector.fit(X, y)
    return X.columns[selector.get_support()]
| |
|
|
| |
def chi_square_test(X, y, select_k=10):
    """
    Compute chi-squared stats between each non-negative feature and class.
    This score should be used to evaluate categorical variables in a classification task.

    ``select_k >= 1`` selects that many features (SelectKBest);
    ``0 < select_k < 1`` selects that fraction (SelectPercentile);
    anything else raises ValueError.  Returns the selected column Index.
    """
    if select_k >= 1:
        selector = SelectKBest(chi2, k=select_k)
    elif 0 < select_k < 1:
        selector = SelectPercentile(chi2, percentile=select_k * 100)
    else:
        raise ValueError("select_k must be a positive number")

    selector.fit(X, y)
    return X.columns[selector.get_support()]
| |
|
|
def univariate_roc_auc(X_train, y_train, X_test, y_test, threshold):
    """Rank features by the ROC-AUC of a one-feature decision tree.

    First, it builds one decision tree per feature, to predict the target.
    Second, it makes predictions using the decision tree and that feature.
    Third, it ranks the features by the ROC-AUC on the test set and keeps
    those scoring strictly above ``threshold``.

    Returns
    -------
    pd.Series
        feature -> roc-auc, restricted to the kept features.
    """
    roc_values = []
    for feature in X_train.columns:
        clf = DecisionTreeClassifier()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict_proba(X_test[feature].to_frame())
        # Column 1 holds the probability of the positive class.
        roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))
    roc_values = pd.Series(roc_values, index=X_train.columns)
    print(roc_values.sort_values(ascending=False))
    keep_col = roc_values[roc_values > threshold]
    # Bug fix: typo 'featues' -> 'features' in the user-facing message.
    print(len(keep_col), 'out of the %s features are kept' % len(X_train.columns))
    return keep_col
| |
| |
def univariate_mse(X_train, y_train, X_test, y_test, threshold):
    """Rank features by the MSE of a one-feature decision tree.

    First, it builds one decision tree per feature, to predict the target.
    Second, it makes predictions using the decision tree and that feature.
    Third, it ranks the features by the test-set MSE and keeps those
    strictly above ``threshold``.

    NOTE(review): lower MSE means a *better* feature, yet this keeps
    features whose MSE is ABOVE the threshold (mirroring the roc-auc
    variant).  Behavior is preserved here — confirm whether '<' was
    intended before changing it.

    Returns
    -------
    pd.Series
        feature -> mse, restricted to the kept features.
    """
    mse_values = []
    for feature in X_train.columns:
        clf = DecisionTreeRegressor()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict(X_test[feature].to_frame())
        mse_values.append(mean_squared_error(y_test, y_scored))
    mse_values = pd.Series(mse_values, index=X_train.columns)
    print(mse_values.sort_values(ascending=False))
    keep_col = mse_values[mse_values > threshold]
    # Bug fix: typo 'featues' -> 'features' in the user-facing message.
    print(len(keep_col), 'out of the %s features are kept' % len(X_train.columns))
    return keep_col
| |