data_path = '/home/naji/Desktop/github-repos/machine-learning/nbs/0-datasets/data/'
Feature Selection
from pandas import read_csv
from matplotlib import pyplot
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
Select Categorical Input Features
# load the dataset
def load_data(filename):
    df = read_csv(data_path + filename, header=None)
    data = df.values
    # split into input and output columns
    X, y = data[:, :-1], data[:, -1]
    # treat all fields as strings (categorical)
    X = X.astype(str)
    return X, y
# prepare input data with an ordinal encoding
def prepare_inputs(X_train, X_test):
    ordenc = OrdinalEncoder()
    ordenc.fit(X_train)
    X_train_enc = ordenc.transform(X_train)
    X_test_enc = ordenc.transform(X_test)
    return X_train_enc, X_test_enc
# prepare target data with a label encoding
def prepare_targets(y_train, y_test):
    labenc = LabelEncoder()
    labenc.fit(y_train)
    y_train_enc = labenc.transform(y_train)
    y_test_enc = labenc.transform(y_test)
    return y_train_enc, y_test_enc
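Note that because the encoder is fit on the training split only, transform will raise an error if the test split contains a category the encoder has never seen. A minimal defensive variant, assuming scikit-learn 0.24+ (where OrdinalEncoder gained the handle_unknown parameter); the function name prepare_inputs_safe is just an illustration:
# variant of prepare_inputs: unseen test-set categories are mapped to -1
# instead of raising an error (requires scikit-learn >= 0.24)
def prepare_inputs_safe(X_train, X_test):
    ordenc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    ordenc.fit(X_train)
    return ordenc.transform(X_train), ordenc.transform(X_test)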
Breast Cancer Categorical Dataset
df = read_csv(data_path + 'breast-cancer.csv', header=None)
df.head(10)
X, y = load_data('breast-cancer.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
X.shape, y.shape
X[:5, :]
y[:5]
X_train_enc.shape, X_test_enc.shape
X_train_enc[:5, :]
Model Built Using All Features
model = LogisticRegression(solver='lbfgs')
model.fit(X_train_enc, y_train_enc)
yhat = model.predict(X_test_enc)
accuracy = accuracy_score(y_test_enc, yhat)
print(f'Accuracy: {accuracy*100:.2f}%')
Categorical Feature Selection
Chi-Squared Feature Selection
from sklearn.feature_selection import SelectKBest, chi2
def select_features(X_train, y_train, X_test, k='all'):
    fs = SelectKBest(score_func=chi2, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
X, y = load_data('breast-cancer.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
X_train_fs, X_test_fs, fs = select_features(X_train_enc, y_train_enc, X_test_enc)
# report the chi-squared score of each feature
for i in range(len(fs.scores_)):
    print(f'Feature {i}: {fs.scores_[i]:.3f}')
pyplot.bar([i for i in range(len(fs.scores_))], fs.scores_)
pyplot.show()
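The chi2 score function also returns a p-value per feature, which SelectKBest stores in fs.pvalues_ and which can be easier to interpret than raw scores; a small sketch:
# p-values for the chi-squared tests (a small p suggests the feature
# is not independent of the target)
for i, p in enumerate(fs.pvalues_):
    print(f'Feature {i}: p={p:.3f}')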
Model Built Using Chi-Squared Features
model = LogisticRegression(solver='lbfgs')
model.fit(X_train_fs, y_train_enc)
yhat = model.predict(X_test_fs)
accuracy = accuracy_score(y_test_enc, yhat)
print(f'Accuracy: {accuracy*100:.2f}%')
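With k='all' the selector keeps every column, so this matches the all-features baseline. As a sketch, the same evaluation can be repeated with only the highest-scoring features (k=4 here is an illustrative choice, not a value tuned on this data):
# evaluate a model fit on only the k best chi-squared features
X_train_fs, X_test_fs, fs = select_features(X_train_enc, y_train_enc, X_test_enc, k=4)
model = LogisticRegression(solver='lbfgs')
model.fit(X_train_fs, y_train_enc)
yhat = model.predict(X_test_fs)
print(f'Accuracy: {accuracy_score(y_test_enc, yhat)*100:.2f}%')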
Mutual Information Feature Selection
from sklearn.feature_selection import SelectKBest, mutual_info_classif
def select_features(X_train, y_train, X_test, k='all'):
    fs = SelectKBest(score_func=mutual_info_classif, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
X, y = load_data('breast-cancer.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
X_train_fs, X_test_fs, fs = select_features(X_train_enc, y_train_enc, X_test_enc)
# report the mutual information score of each feature
for i in range(len(fs.scores_)):
    print(f'Feature {i}: {fs.scores_[i]:.3f}')
pyplot.bar([i for i in range(len(fs.scores_))], fs.scores_)
pyplot.show()
Model Built Using Mutual Information Features
model = LogisticRegression(solver='lbfgs')
model.fit(X_train_fs, y_train_enc)
yhat = model.predict(X_test_fs)
accuracy = accuracy_score(y_test_enc, yhat)
print(f'Accuracy: {accuracy*100:.2f}%')
Select Numerical Input Features
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
Diabetes Numerical Dataset
# load the dataset (the file is assumed to include a header row;
# pass header=None to read_csv if it does not)
def load_data(filename):
    df = read_csv(data_path + filename)
    data = df.values
    # split into input and output columns
    X = data[:, :-1]
    y = data[:, -1]
    return X, y
X, y = load_data('pima-indians-diabetes.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
Model Built Using All Features
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)
yhat = model.predict(X_test)
accuracy = accuracy_score(y_test, yhat)
print(f'Accuracy: {accuracy*100:.2f}%')
Numerical Feature Selection
from sklearn.feature_selection import SelectKBest, f_classif
ANOVA F-test Feature Selection
def select_features(X_train, y_train, X_test, k='all'):
    fs = SelectKBest(score_func=f_classif, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
X, y = load_data('pima-indians-diabetes.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
# report the ANOVA F-statistic of each feature
for i in range(len(fs.scores_)):
    print(f'Feature {i}: {fs.scores_[i]:.3f}')
pyplot.bar([i for i in range(len(fs.scores_))], fs.scores_)
pyplot.show()
Model Built Using ANOVA F-test Features
model = LogisticRegression(solver='liblinear')
model.fit(X_train_fs, y_train)
yhat = model.predict(X_test_fs)
accuracy = accuracy_score(y_test, yhat)
print(f'Accuracy: {accuracy*100:.2f}%')
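Again, k='all' keeps every column, so this matches the all-features baseline. A sketch of the same evaluation restricted to the features with the highest F-statistics (k=4 is an illustrative choice, not a tuned value):
# evaluate a model fit on only the k best ANOVA features
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test, k=4)
model = LogisticRegression(solver='liblinear')
model.fit(X_train_fs, y_train)
yhat = model.predict(X_test_fs)
print(f'Accuracy: {accuracy_score(y_test, yhat)*100:.2f}%')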
Mutual Information Feature Selection
from sklearn.feature_selection import SelectKBest, mutual_info_classif
def select_features(X_train, y_train, X_test, k='all'):
    fs = SelectKBest(score_func=mutual_info_classif, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
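The section stops at the function definition; a minimal sketch of applying it to the diabetes data, following the same pattern as the ANOVA section above (note that mutual information is estimated stochastically, so the scores can vary from run to run):
# score and plot the features by estimated mutual information
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
for i in range(len(fs.scores_)):
    print(f'Feature {i}: {fs.scores_[i]:.3f}')
pyplot.bar([i for i in range(len(fs.scores_))], fs.scores_)
pyplot.show()
# evaluate a model fit on the selected features
model = LogisticRegression(solver='liblinear')
model.fit(X_train_fs, y_train)
yhat = model.predict(X_test_fs)
print(f'Accuracy: {accuracy_score(y_test, yhat)*100:.2f}%')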
Select Features for Numerical Output
Perhaps the simplest feature selection setting is the one with numerical input variables and a numerical target, i.e. regression predictive modeling.
In this section, you will discover how to perform feature selection with numerical input data for regression predictive modeling.
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
Model Built Using All Features
X, y = make_regression(n_samples=1000, n_features=100, n_informative=10, noise=0.1, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
model = LinearRegression()
model.fit(X_train, y_train)
yhat = model.predict(X_test)
mae = mean_absolute_error(y_test, yhat)
print(f'MAE: {mae:.3f}')
Model Built Using Correlation Features
def select_features(X_train, y_train, X_test):
    # keep the 88 features with the highest correlation-based F-statistic
    fs = SelectKBest(score_func=f_regression, k=88)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
X, y = make_regression(n_samples=1000, n_features=100, n_informative=10, noise=0.1, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
model = LinearRegression()
model.fit(X_train_fs, y_train)
yhat = model.predict(X_test_fs)
mae = mean_absolute_error(y_test, yhat)
print(f'MAE: {mae:.3f}')
Model Built Using Mutual Information Features
from sklearn.feature_selection import mutual_info_regression
def select_features(X_train, y_train, X_test):
    # keep the 88 features with the highest estimated mutual information
    fs = SelectKBest(score_func=mutual_info_regression, k=88)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
X, y = make_regression(n_samples=1000, n_features=100, n_informative=10, noise=0.1, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
model = LinearRegression()
model.fit(X_train_fs, y_train)
yhat = model.predict(X_test_fs)
mae = mean_absolute_error(y_test, yhat)
print(f'MAE: {mae:.3f}')
Tune the Number of Selected Features
from numpy import mean, std
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedKFold, cross_val_score
from matplotlib import pyplot
X, y = make_regression(n_samples=1000, n_features=100, n_informative=10, noise=0.1, random_state=1)
# evaluate k from 81 up to all 100 features
num_features = [i for i in range(X.shape[1]-19, X.shape[1]+1)]
results = list()
for k in num_features:
    model = LinearRegression()
    fs = SelectKBest(score_func=mutual_info_regression, k=k)
    pipeline = Pipeline(steps=[('sel', fs), ('lr', model)])
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='neg_mean_absolute_error', cv=cv)
    results.append(scores)
    print(f'{k}: {mean(scores):.3f} ({std(scores):.3f})')
# compare the score distributions for each k with a box-and-whisker plot
pyplot.boxplot(results, labels=num_features, showmeans=True)
pyplot.show()
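The same search can be phrased as a hyperparameter grid over the pipeline's k, letting scikit-learn pick the best value automatically. A minimal sketch under the same assumptions as the loop above (the step name 'sel' and the range of k values are carried over from it):
# treat k as a tunable hyperparameter of the pipeline
from sklearn.model_selection import GridSearchCV
pipeline = Pipeline(steps=[('sel', SelectKBest(score_func=mutual_info_regression)), ('lr', LinearRegression())])
grid = {'sel__k': [i for i in range(X.shape[1]-19, X.shape[1]+1)]}
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
search = GridSearchCV(pipeline, grid, scoring='neg_mean_absolute_error', cv=cv)
search.fit(X, y)
# best_score_ is a negated MAE, so flip the sign for reporting
print(f'Best MAE: {-search.best_score_:.3f} using {search.best_params_}')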