Feature Selection

data_path = '/home/naji/Desktop/github-repos/machine-learning/nbs/0-datasets/data/'
from pandas import read_csv
from matplotlib import pyplot
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Select Categorical Input Features
# load the dataset
def load_data(filename):
    df = read_csv(data_path + filename, header=None)
    data = df.values
    X, y = data[:, :-1], data[:, -1]
    X = X.astype(str)
    return X, y

# prepare input data
def prepare_inputs(X_train, X_test):
    ordenc = OrdinalEncoder()
    ordenc.fit(X_train)
    X_train_enc = ordenc.transform(X_train)
    X_test_enc = ordenc.transform(X_test)
    return X_train_enc, X_test_enc

def prepare_targets(y_train, y_test):
    labenc = LabelEncoder()
    labenc.fit(y_train)
    y_train_enc = labenc.transform(y_train)
    y_test_enc = labenc.transform(y_test)
    return y_train_enc, y_test_enc
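Note that prepare_inputs fits the encoder on the training split only, so transform will raise an error if the test split contains a category value never seen in training. A minimal defensive variant, assuming scikit-learn >= 0.24 (where OrdinalEncoder accepts handle_unknown):

# sketch: map categories unseen during fit to -1 instead of raising at transform time
def prepare_inputs_safe(X_train, X_test):
    ordenc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    ordenc.fit(X_train)
    return ordenc.transform(X_train), ordenc.transform(X_test)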
Breast Cancer Categorical Dataset

df = read_csv(data_path + 'breast-cancer.csv', header=None)
df.head(10)

X, y = load_data('breast-cancer.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)

X.shape, y.shape
X[:5, :]
y[:5]
X_train_enc.shape, X_test_enc.shape
X_train_enc[:5, :]
Model Built Using All Features

model = LogisticRegression(solver='lbfgs')
model.fit(X_train_enc, y_train_enc)
yhat = model.predict(X_test_enc)
accuracy = accuracy_score(y_test_enc, yhat)
print(f'Accuracy: {accuracy*100:.2f}%')

Categorical Feature Selection
Chi-Squared Feature Selection
from sklearn.feature_selection import SelectKBest, chi2

def select_features(X_train, y_train, X_test, k='all'):
    fs = SelectKBest(score_func=chi2, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

X, y = load_data('breast-cancer.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
X_train_fs, X_test_fs, fs = select_features(X_train_enc, y_train_enc, X_test_enc)

for i in range(len(fs.scores_)):
    print(f'Feature {i}: {fs.scores_[i]:.3f}')

pyplot.bar([i for i in range(len(fs.scores_))], fs.scores_)
pyplot.show()

Model Built Using Chi-Squared Features
model = LogisticRegression(solver='lbfgs')
model.fit(X_train_fs, y_train_enc)
yhat = model.predict(X_test_fs)
accuracy = accuracy_score(y_test_enc, yhat)
print(f'Accuracy: {accuracy*100:.3f}%')
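Since select_features was called with the default k='all', the model above still sees every feature. To fit on a reduced subset, pass an explicit k; a sketch with k=4 (an arbitrary illustrative choice, not a tuned value):

# sketch: keep only the 4 highest-scoring chi-squared features
X_train_fs, X_test_fs, fs = select_features(X_train_enc, y_train_enc, X_test_enc, k=4)
model = LogisticRegression(solver='lbfgs')
model.fit(X_train_fs, y_train_enc)
yhat = model.predict(X_test_fs)
print(f'Accuracy: {accuracy_score(y_test_enc, yhat)*100:.2f}%')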
Mutual Information Feature Selection

from sklearn.feature_selection import SelectKBest, mutual_info_classif

def select_features(X_train, y_train, X_test, k='all'):
    fs = SelectKBest(score_func=mutual_info_classif, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

X, y = load_data('breast-cancer.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
X_train_fs, X_test_fs, fs = select_features(X_train_enc, y_train_enc, X_test_enc)

for i in range(len(fs.scores_)):
    print(f'Feature {i}: {fs.scores_[i]:.3f}')

pyplot.bar([i for i in range(len(fs.scores_))], fs.scores_)
pyplot.show()

Model Built Using Mutual Information Features
model = LogisticRegression(solver='lbfgs')
model.fit(X_train_fs, y_train_enc)
yhat = model.predict(X_test_fs)
accuracy = accuracy_score(y_test_enc, yhat)
print(f'Accuracy: {accuracy*100:.2f}%')

Select Numerical Input Features
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Diabetes Numerical Dataset
def load_data(filename):
    df = read_csv(data_path + filename)
    data = df.values
    X = data[:, :-1]
    y = data[:, -1]
    return X, y

X, y = load_data('pima-indians-diabetes.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

Model Built Using All Features
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)
yhat = model.predict(X_test)
accuracy = accuracy_score(y_test, yhat)
print(f'Accuracy: {accuracy*100:.2f}%')

Numerical Feature Selection
from sklearn.feature_selection import SelectKBest, f_classif

ANOVA F-test Feature Selection
def select_features(X_train, y_train, X_test, k='all'):
    fs = SelectKBest(score_func=f_classif, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

X, y = load_data('pima-indians-diabetes.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)

for i in range(len(fs.scores_)):
    print(f'Feature {i}: {fs.scores_[i]:.3f}')

pyplot.bar([i for i in range(len(fs.scores_))], fs.scores_)
pyplot.show()

Model Built Using ANOVA F-test Features
model = LogisticRegression(solver='liblinear')
model.fit(X_train_fs, y_train)
yhat = model.predict(X_test_fs)
accuracy = accuracy_score(y_test, yhat)
print(f'Accuracy: {accuracy*100:.3f}%')
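As in the categorical case, k defaulted to 'all', so this model still uses all eight diabetes inputs. A sketch fitting on only the four highest-scoring features (k=4 is an illustrative choice, not a tuned value):

# sketch: keep only the 4 features with the largest ANOVA F-statistics
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test, k=4)
model = LogisticRegression(solver='liblinear')
model.fit(X_train_fs, y_train)
yhat = model.predict(X_test_fs)
print(f'Accuracy: {accuracy_score(y_test, yhat)*100:.2f}%')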
Mutual Information Feature Selection

from sklearn.feature_selection import SelectKBest, mutual_info_classif

def select_features(X_train, y_train, X_test, k='all'):
    fs = SelectKBest(score_func=mutual_info_classif, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
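The notebook defines this selector but no evaluation cell follows it; a sketch mirroring the ANOVA F-test cells above, scoring the diabetes features with mutual information and fitting on the selected subset:

X, y = load_data('pima-indians-diabetes.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)

# print the mutual information score of each feature
for i in range(len(fs.scores_)):
    print(f'Feature {i}: {fs.scores_[i]:.3f}')

model = LogisticRegression(solver='liblinear')
model.fit(X_train_fs, y_train)
yhat = model.predict(X_test_fs)
print(f'Accuracy: {accuracy_score(y_test, yhat)*100:.2f}%')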
Select Features for Numerical Output

Perhaps the simplest case of feature selection is when there are numerical input variables and a numerical target, i.e. regression predictive modeling.
In this tutorial, you will discover how to perform feature selection with numerical input data for regression predictive modeling.
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression

Model Built Using All Features
X, y = make_regression(n_samples=1000, n_features=100, n_informative=10, noise=0.1, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
model = LinearRegression()
model.fit(X_train, y_train)
yhat = model.predict(X_test)
mae = mean_absolute_error(y_test, yhat)
print(f'MAE: {mae:.3f}')

Model Built Using Correlation Features
def select_features(X_train, y_train, X_test):
    fs = SelectKBest(score_func=f_regression, k=88)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

X, y = make_regression(n_samples=1000, n_features=100, n_informative=10, noise=0.1, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
model = LinearRegression()
model.fit(X_train_fs, y_train)
yhat = model.predict(X_test_fs)
mae = mean_absolute_error(y_test, yhat)
print(f'MAE: {mae:.3f}')

Model Built Using Mutual Information Features
from sklearn.feature_selection import mutual_info_regression

def select_features(X_train, y_train, X_test):
    fs = SelectKBest(score_func=mutual_info_regression, k=88)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

X, y = make_regression(n_samples=1000, n_features=100, n_informative=10, noise=0.1, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
model = LinearRegression()
model.fit(X_train_fs, y_train)
yhat = model.predict(X_test_fs)
mae = mean_absolute_error(y_test, yhat)
print(f'MAE: {mae:.3f}')

Tune the Number of Selected Features
from numpy import mean, std
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedKFold, cross_val_score
from matplotlib import pyplot

X, y = make_regression(n_samples=1000, n_features=100, n_informative=10, noise=0.1, random_state=1)
# evaluate k from (n_features - 19) up to n_features, i.e. 81 through 100
num_features = [i for i in range(X.shape[1]-19, X.shape[1]+1)]
results = list()
for k in num_features:
    model = LinearRegression()
    fs = SelectKBest(score_func=mutual_info_regression, k=k)
    pipeline = Pipeline(steps=[('sel', fs), ('lr', model)])
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='neg_mean_absolute_error', cv=cv)
    results.append(scores)
    print(f'{k} {mean(scores):.3f} {std(scores):.3f}')
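The collected scores are never visualized; the truncated trailing text suggests a pyplot call was cut off. A sketch comparing the score distributions per k with a box-and-whisker plot:

# sketch: one box per candidate k, mean marked, scores are negative MAE (higher is better)
pyplot.boxplot(results, labels=num_features, showmeans=True)
pyplot.xlabel('number of selected features (k)')
pyplot.ylabel('negative MAE')
pyplot.show()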