| | import pandas as pd |
| | import numpy as np |
| | |
| | from sklearn.feature_selection import mutual_info_classif,chi2 |
| | from sklearn.feature_selection import SelectKBest, SelectPercentile |
| | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor |
| | from sklearn.metrics import roc_auc_score, mean_squared_error |
| |
|
| | |
| |
|
def constant_feature_detect(data, threshold=0.98):
    """Detect features that show the same value for the
    majority/all of the observations (constant/quasi-constant features).

    Parameters
    ----------
    data : pd.DataFrame
    threshold : float
        Fraction of rows sharing a single value above which the feature
        is flagged as (quasi-)constant.

    Returns
    -------
    list
        Names of the (quasi-)constant features.
    """
    data_copy = data.copy(deep=True)
    quasi_constant_feature = []
    for feature in data_copy.columns:
        # Share of the most frequent value. Divide by the full row count
        # (not the non-null count) so NaN-heavy columns are not inflated.
        # Fix: np.float was removed in NumPy 1.20+; use builtin float.
        predominant = (data_copy[feature].value_counts() / float(
            len(data_copy))).sort_values(ascending=False).values[0]
        if predominant >= threshold:
            quasi_constant_feature.append(feature)
    print(len(quasi_constant_feature),' variables are found to be almost constant')
    return quasi_constant_feature
| |
|
| |
|
def corr_feature_detect(data, threshold=0.8):
    """Detect groups of highly correlated features in a DataFrame.

    Parameters
    ----------
    data : pd.DataFrame
    threshold : float
        Absolute correlation above which two features are treated
        as correlated.

    Returns
    -------
    list of pd.DataFrame
        One frame per correlated group, each with columns
        ['feature1', 'feature2', 'corr'].
    """
    # Flatten the absolute correlation matrix into (feature1, feature2, corr)
    # pairs, keeping only strong, non-self correlations.
    pairs = data.corr().abs().unstack().sort_values(ascending=False)
    pairs = pairs[(pairs >= threshold) & (pairs < 1)]
    pairs = pd.DataFrame(pairs).reset_index()
    pairs.columns = ['feature1', 'feature2', 'corr']

    seen = []    # features already assigned to some group
    groups = []  # one DataFrame of pairs per correlated group

    for feat in pairs.feature1.unique():
        if feat in seen:
            continue
        # All pairs anchored on this feature form one group.
        block = pairs[pairs.feature1 == feat]
        seen = seen + list(block.feature2.unique()) + [feat]
        groups.append(block)
    return groups
| |
|
| |
|
def mutual_info(X, y, select_k=10):
    """Select features by mutual information with the target.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix.
    y : array-like
        Class labels.
    select_k : int or float
        If >= 1, the number of top features to keep (SelectKBest);
        if in (0, 1), the fraction of features to keep (SelectPercentile).

    Returns
    -------
    pd.Index
        Names of the selected columns.

    Raises
    ------
    ValueError
        If ``select_k`` is not a positive number.
    """
    if select_k >= 1:
        selector = SelectKBest(mutual_info_classif, k=select_k).fit(X, y)
    elif 0 < select_k < 1:
        selector = SelectPercentile(
            mutual_info_classif, percentile=select_k * 100).fit(X, y)
    else:
        raise ValueError("select_k must be a positive number")
    return X.columns[selector.get_support()]
| | |
| |
|
| | |
def chi_square_test(X, y, select_k=10):
    """Compute chi-squared stats between each non-negative feature and class.

    This score should be used to evaluate categorical variables in a
    classification task.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix (non-negative values).
    y : array-like
        Class labels.
    select_k : int or float
        If >= 1, the number of top features to keep (SelectKBest);
        if in (0, 1), the fraction of features to keep (SelectPercentile).

    Returns
    -------
    pd.Index
        Names of the selected columns.

    Raises
    ------
    ValueError
        If ``select_k`` is not a positive number.
    """
    if select_k >= 1:
        selector = SelectKBest(chi2, k=select_k).fit(X, y)
    elif 0 < select_k < 1:
        selector = SelectPercentile(chi2, percentile=select_k * 100).fit(X, y)
    else:
        raise ValueError("select_k must be a positive number")
    return X.columns[selector.get_support()]
| | |
| |
|
def univariate_roc_auc(X_train, y_train, X_test, y_test, threshold):
    """Rank features by the ROC-AUC of a single-feature decision tree.

    First, it builds one decision tree per feature, to predict the target.
    Second, it makes predictions using the decision tree and that feature.
    Third, it ranks the features by ROC-AUC on the test set and keeps
    those scoring above ``threshold``.

    Parameters
    ----------
    X_train, X_test : pd.DataFrame
    y_train, y_test : array-like
        Class labels; assumes binary classification (positive-class
        probability is taken from column 1 of predict_proba).
    threshold : float
        Minimum ROC-AUC for a feature to be kept.

    Returns
    -------
    pd.Series
        ROC-AUC per kept feature, indexed by feature name.
    """
    roc_values = []
    for feature in X_train.columns:
        clf = DecisionTreeClassifier()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict_proba(X_test[feature].to_frame())
        roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))
    roc_values = pd.Series(roc_values)
    roc_values.index = X_train.columns
    print(roc_values.sort_values(ascending=False))
    # Fixed typo in the user-facing message: "featues" -> "features".
    print(len(roc_values[roc_values > threshold]),
          'out of the %s features are kept' % len(X_train.columns))
    keep_col = roc_values[roc_values > threshold]
    return keep_col
| | |
| | |
def univariate_mse(X_train, y_train, X_test, y_test, threshold):
    """Rank features by the MSE of a single-feature decision tree.

    First, it builds one decision tree per feature, to predict the target.
    Second, it makes predictions using the decision tree and that feature.
    Third, it ranks the features by mean squared error on the test set
    and keeps those scoring above ``threshold``.

    NOTE(review): features with MSE ABOVE the threshold are kept, yet for
    MSE lower is better — confirm this selection direction is intentional.

    Parameters
    ----------
    X_train, X_test : pd.DataFrame
    y_train, y_test : array-like
        Regression targets.
    threshold : float
        MSE cut-off; features with MSE strictly greater are returned.

    Returns
    -------
    pd.Series
        MSE per kept feature, indexed by feature name.
    """
    mse_values = []
    for feature in X_train.columns:
        clf = DecisionTreeRegressor()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict(X_test[feature].to_frame())
        mse_values.append(mean_squared_error(y_test, y_scored))
    mse_values = pd.Series(mse_values)
    mse_values.index = X_train.columns
    print(mse_values.sort_values(ascending=False))
    # Fixed typo in the user-facing message: "featues" -> "features".
    print(len(mse_values[mse_values > threshold]),
          'out of the %s features are kept' % len(X_train.columns))
    keep_col = mse_values[mse_values > threshold]
    return keep_col
| | |