| | import pandas as pd |
| | import numpy as np |
| | |
| | from sklearn.feature_selection import mutual_info_classif,chi2 |
| | from sklearn.feature_selection import SelectKBest, SelectPercentile |
| | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor |
| | from sklearn.metrics import roc_auc_score, mean_squared_error |
| |
|
| | |
| |
|
def constant_feature_detect(data, threshold=0.98):
    """Detect features that show the same value for the
    majority/all of the observations (constant/quasi-constant features).

    Parameters
    ----------
    data : pd.DataFrame
    threshold : float
        Fraction of rows sharing a single value above which the feature
        is flagged as (quasi-)constant.

    Returns
    -------
    list
        Names of the (quasi-)constant features.
    """
    data_copy = data.copy(deep=True)
    quasi_constant_feature = []
    for feature in data_copy.columns:
        # Share of the most frequent value. Divide by the full row count
        # (not the non-null count) so NaN-heavy columns are not inflated.
        # Fix: np.float was removed in NumPy 1.20+; use builtin float.
        predominant = (data_copy[feature].value_counts() / float(
            len(data_copy))).sort_values(ascending=False).values[0]
        if predominant >= threshold:
            quasi_constant_feature.append(feature)
    print(len(quasi_constant_feature),' variables are found to be almost constant')
    return quasi_constant_feature
| |
|
| |
|
def corr_feature_detect(data, threshold=0.8):
    """Detect groups of highly correlated features in a DataFrame.

    Parameters
    ----------
    data : pd.DataFrame
    threshold : float
        Absolute correlation above which two features are treated
        as correlated.

    Returns
    -------
    list of pd.DataFrame
        One frame per correlated group, each with columns
        ['feature1', 'feature2', 'corr'].
    """
    # Flatten the absolute correlation matrix into (feature1, feature2, corr)
    # pairs, keeping only strong, non-self correlations.
    pairs = data.corr().abs().unstack().sort_values(ascending=False)
    pairs = pairs[(pairs >= threshold) & (pairs < 1)]
    pairs = pd.DataFrame(pairs).reset_index()
    pairs.columns = ['feature1', 'feature2', 'corr']

    seen = []    # features already assigned to some group
    groups = []  # one DataFrame of pairs per correlated group

    for feat in pairs.feature1.unique():
        if feat in seen:
            continue
        # All pairs anchored on this feature form one group.
        block = pairs[pairs.feature1 == feat]
        seen = seen + list(block.feature2.unique()) + [feat]
        groups.append(block)
    return groups
| |
|
| |
|
def mutual_info(X, y, select_k=10):
    """Select features by mutual information with the target.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix.
    y : array-like
        Class labels.
    select_k : int or float
        If >= 1, the number of top features to keep (SelectKBest);
        if in (0, 1), the fraction of features to keep (SelectPercentile).

    Returns
    -------
    pd.Index
        Names of the selected columns.

    Raises
    ------
    ValueError
        If ``select_k`` is not a positive number.
    """
    if select_k >= 1:
        selector = SelectKBest(mutual_info_classif, k=select_k).fit(X, y)
    elif 0 < select_k < 1:
        selector = SelectPercentile(
            mutual_info_classif, percentile=select_k * 100).fit(X, y)
    else:
        raise ValueError("select_k must be a positive number")
    return X.columns[selector.get_support()]
| | |
| |
|
| | |
def chi_square_test(X, y, select_k=10):
    """Compute chi-squared stats between each non-negative feature and class.

    This score should be used to evaluate categorical variables in a
    classification task.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix (non-negative values).
    y : array-like
        Class labels.
    select_k : int or float
        If >= 1, the number of top features to keep (SelectKBest);
        if in (0, 1), the fraction of features to keep (SelectPercentile).

    Returns
    -------
    pd.Index
        Names of the selected columns.

    Raises
    ------
    ValueError
        If ``select_k`` is not a positive number.
    """
    if select_k >= 1:
        selector = SelectKBest(chi2, k=select_k).fit(X, y)
    elif 0 < select_k < 1:
        selector = SelectPercentile(chi2, percentile=select_k * 100).fit(X, y)
    else:
        raise ValueError("select_k must be a positive number")
    return X.columns[selector.get_support()]
| | |
| |
|
def univariate_roc_auc(X_train, y_train, X_test, y_test, threshold):
    """Rank features by the ROC-AUC of a single-feature decision tree.

    First, it builds one decision tree per feature, to predict the target.
    Second, it makes predictions using the decision tree and that feature.
    Third, it ranks the features by ROC-AUC on the test set and keeps
    those scoring above ``threshold``.

    Parameters
    ----------
    X_train, X_test : pd.DataFrame
    y_train, y_test : array-like
        Class labels; assumes binary classification (positive-class
        probability is taken from column 1 of predict_proba).
    threshold : float
        Minimum ROC-AUC for a feature to be kept.

    Returns
    -------
    pd.Series
        ROC-AUC per kept feature, indexed by feature name.
    """
    roc_values = []
    for feature in X_train.columns:
        clf = DecisionTreeClassifier()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict_proba(X_test[feature].to_frame())
        roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))
    roc_values = pd.Series(roc_values)
    roc_values.index = X_train.columns
    print(roc_values.sort_values(ascending=False))
    # Fixed typo in the user-facing message: "featues" -> "features".
    print(len(roc_values[roc_values > threshold]),
          'out of the %s features are kept' % len(X_train.columns))
    keep_col = roc_values[roc_values > threshold]
    return keep_col
| | |
| | |
def univariate_mse(X_train, y_train, X_test, y_test, threshold):
    """Rank features by the MSE of a single-feature decision tree.

    First, it builds one decision tree per feature, to predict the target.
    Second, it makes predictions using the decision tree and that feature.
    Third, it ranks the features by mean squared error on the test set
    and keeps those scoring above ``threshold``.

    NOTE(review): features with MSE ABOVE the threshold are kept, yet for
    MSE lower is better — confirm this selection direction is intentional.

    Parameters
    ----------
    X_train, X_test : pd.DataFrame
    y_train, y_test : array-like
        Regression targets.
    threshold : float
        MSE cut-off; features with MSE strictly greater are returned.

    Returns
    -------
    pd.Series
        MSE per kept feature, indexed by feature name.
    """
    mse_values = []
    for feature in X_train.columns:
        clf = DecisionTreeRegressor()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict(X_test[feature].to_frame())
        mse_values.append(mean_squared_error(y_test, y_scored))
    mse_values = pd.Series(mse_values)
    mse_values.index = X_train.columns
    print(mse_values.sort_values(ascending=False))
    # Fixed typo in the user-facing message: "featues" -> "features".
    print(len(mse_values[mse_values > threshold]),
          'out of the %s features are kept' % len(X_train.columns))
    keep_col = mse_values[mse_values > threshold]
    return keep_col
| | |