Feature Selection

data_path = '/home/naji/Desktop/github-repos/machine-learning/nbs/0-datasets/data/'
from pandas import read_csv
from matplotlib import pyplot
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Select Categorical Input Features

# load the dataset
def load_data(filename):
    df = read_csv(data_path + filename, header=None)
    data = df.values
    # split into input (X) and output (y) columns
    X, y = data[:, :-1], data[:, -1]
    # format all input fields as strings so they can be ordinal-encoded
    X = X.astype(str)
    return X, y
# prepare input data
def prepare_inputs(X_train, X_test):
    ordenc = OrdinalEncoder()
    ordenc.fit(X_train)
    X_train_enc = ordenc.transform(X_train)
    X_test_enc = ordenc.transform(X_test)
    return X_train_enc, X_test_enc
# prepare target data
def prepare_targets(y_train, y_test):
    labenc = LabelEncoder()
    labenc.fit(y_train)
    y_train_enc = labenc.transform(y_train)
    y_test_enc = labenc.transform(y_test)
    return y_train_enc, y_test_enc

Breast Cancer Categorical Dataset

df = read_csv(data_path + 'breast-cancer.csv', header=None)
df.head(10)
X, y = load_data('breast-cancer.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
X.shape, y.shape
X[:5, :]
y[:5]
X_train_enc.shape, X_test_enc.shape
X_train_enc[:5, :]

Model Built Using All Features

model = LogisticRegression(solver='lbfgs')
model.fit(X_train_enc, y_train_enc)
yhat = model.predict(X_test_enc)
accuracy = accuracy_score(y_test_enc, yhat)
print(f'Accuracy: {accuracy*100:.2f}%')

Categorical Feature Selection

Chi-Squared Feature Selection

from sklearn.feature_selection import SelectKBest, chi2
def select_features(X_train, y_train, X_test, k='all'):
    fs = SelectKBest(score_func=chi2, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
X, y = load_data('breast-cancer.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
X_train_fs, X_test_fs, fs = select_features(X_train_enc, y_train_enc, X_test_enc)
for i in range(len(fs.scores_)):
    print(f'Feature {i}: {fs.scores_[i]:.3f}')
pyplot.bar([i for i in range(len(fs.scores_))], fs.scores_)
pyplot.show()

Model Built Using Chi-Squared Features

model = LogisticRegression(solver='lbfgs')
model.fit(X_train_fs, y_train_enc)
yhat = model.predict(X_test_fs)
accuracy = accuracy_score(y_test_enc, yhat)
print(f'Accuracy: {accuracy*100:.2f}%')

Mutual Information Feature Selection

from sklearn.feature_selection import SelectKBest, mutual_info_classif
def select_features(X_train, y_train, X_test, k='all'):
    fs = SelectKBest(score_func=mutual_info_classif, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
X, y = load_data('breast-cancer.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
X_train_fs, X_test_fs, fs = select_features(X_train_enc, y_train_enc, X_test_enc)
for i in range(len(fs.scores_)):
    print(f'Feature {i}: {fs.scores_[i]:.3f}')
pyplot.bar([i for i in range(len(fs.scores_))], fs.scores_)
pyplot.show()

Model Built Using Mutual Information Features

model = LogisticRegression(solver='lbfgs')
model.fit(X_train_fs, y_train_enc)
yhat = model.predict(X_test_fs)
accuracy = accuracy_score(y_test_enc, yhat)
print(f'Accuracy: {accuracy*100:.2f}%')

Select Numerical Input Features

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Diabetes Numerical Dataset

def load_data(filename):
    df = read_csv(data_path + filename)
    data = df.values
    # split into input (X) and output (y) columns
    X = data[:, :-1]
    y = data[:, -1]
    return X, y
X, y = load_data('pima-indians-diabetes.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

Model Built Using All Features

model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)
yhat = model.predict(X_test)
accuracy = accuracy_score(y_test, yhat)
print(f'Accuracy: {accuracy*100:.2f}%')

Numerical Feature Selection

from sklearn.feature_selection import SelectKBest, f_classif

ANOVA F-test Feature Selection

def select_features(X_train, y_train, X_test, k='all'):
    fs = SelectKBest(score_func=f_classif, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
X, y = load_data('pima-indians-diabetes.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
for i in range(len(fs.scores_)):
    print(f'Feature {i}: {fs.scores_[i]:.3f}')
pyplot.bar([i for i in range(len(fs.scores_))], fs.scores_)
pyplot.show()

Model Built Using ANOVA F-test Features

model = LogisticRegression(solver='liblinear')
model.fit(X_train_fs, y_train)
yhat = model.predict(X_test_fs)
accuracy = accuracy_score(y_test, yhat)
print(f'Accuracy: {accuracy*100:.2f}%')

Mutual Information Feature Selection

from sklearn.feature_selection import SelectKBest, mutual_info_classif
def select_features(X_train, y_train, X_test, k='all'):
    fs = SelectKBest(score_func=mutual_info_classif, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
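
The selector above can be applied and scored the same way as the ANOVA F-test version; a minimal sketch, reusing the notebook's own load_data, prepare-and-split pattern, and evaluation cells:

X, y = load_data('pima-indians-diabetes.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
for i in range(len(fs.scores_)):
    print(f'Feature {i}: {fs.scores_[i]:.3f}')
pyplot.bar([i for i in range(len(fs.scores_))], fs.scores_)
pyplot.show()
model = LogisticRegression(solver='liblinear')
model.fit(X_train_fs, y_train)
yhat = model.predict(X_test_fs)
accuracy = accuracy_score(y_test, yhat)
print(f'Accuracy: {accuracy*100:.2f}%')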

Select Features for Numerical Output

Perhaps the simplest case of feature selection is when the input variables and the target are both numerical, as in regression predictive modeling.

In this tutorial, you will discover how to perform feature selection with numerical input data for regression predictive modeling.

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression

Model Built Using All Features

X, y = make_regression(n_samples=1000, n_features=100, n_informative=10, noise=0.1, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
model = LinearRegression()
model.fit(X_train, y_train)
yhat = model.predict(X_test)
mae = mean_absolute_error(y_test, yhat)
print(f'MAE: {mae:.3f}')

Model Built Using Correlation Features

def select_features(X_train, y_train, X_test):
    fs = SelectKBest(score_func=f_regression, k=88)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
X, y = make_regression(n_samples=1000, n_features=100, n_informative=10, noise=0.1, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
model = LinearRegression()
model.fit(X_train_fs, y_train)
yhat = model.predict(X_test_fs)
mae = mean_absolute_error(y_test, yhat)
print(f'MAE: {mae:0.3f}')

Model Built Using Mutual Information Features

from sklearn.feature_selection import mutual_info_regression
def select_features(X_train, y_train, X_test):
    fs = SelectKBest(score_func=mutual_info_regression, k=88)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
X, y = make_regression(n_samples=1000, n_features=100, n_informative=10, noise=0.1, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
model = LinearRegression()
model.fit(X_train_fs, y_train)
yhat = model.predict(X_test_fs)
mae = mean_absolute_error(y_test, yhat)
print(f'MAE: {mae:0.3f}')

Tune the Number of Selected Features

from numpy import mean, std
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedKFold, cross_val_score
from matplotlib import pyplot
X, y = make_regression(n_samples=1000, n_features=100, n_informative=10, noise=0.1, random_state=1)
num_features = [i for i in range(X.shape[1]-19, X.shape[1]+1)]
results = list()
for k in num_features:
    model = LinearRegression()
    fs = SelectKBest(score_func=mutual_info_regression, k=k)
    pipeline = Pipeline(steps=[('sel', fs), ('lr', model)])
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='neg_mean_absolute_error', cv=cv)
    results.append(scores)
    print(f'>{k} {mean(scores):.3f} ({std(scores):.3f})')
# compare the distributions of MAE scores with a box and whisker plot
pyplot.boxplot(results, labels=num_features, showmeans=True)
pyplot.show()
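
As an alternative to the manual loop, the number of selected features can be treated as a hyperparameter and tuned with a grid search; a sketch under that assumption, reusing the pipeline layout above (the grid key 'sel__k' follows the 'sel' step name):

from sklearn.model_selection import GridSearchCV
# pipeline: mutual-information selection followed by linear regression
model = LinearRegression()
fs = SelectKBest(score_func=mutual_info_regression)
pipeline = Pipeline(steps=[('sel', fs), ('lr', model)])
# search over the same candidate values of k as the loop above
grid = {'sel__k': [i for i in range(X.shape[1]-19, X.shape[1]+1)]}
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
search = GridSearchCV(pipeline, grid, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv)
results = search.fit(X, y)
print(f'Best MAE: {-results.best_score_:.3f}')
print(f'Best config: {results.best_params_}')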

How to Use RFE for Feature Selection
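
A hedged sketch of one common way to use recursive feature elimination (RFE) in scikit-learn: wrap an estimator and recursively drop the weakest features until the requested number remain. The synthetic dataset here is purely illustrative, not part of the notebook's data.

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
# illustrative synthetic classification data (assumption, not from the notebook)
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, random_state=1)
# keep the five strongest features, eliminating the weakest one at a time
rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=5)
rfe.fit(X, y)
for i in range(X.shape[1]):
    print(f'Column {i}: selected={rfe.support_[i]}, rank={rfe.ranking_[i]}')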

How to Use Feature Importance
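
A hedged sketch of one common way to use feature importance for selection: fit a tree ensemble and read its impurity-based importance scores, one per input feature. Again, the synthetic dataset is purely illustrative.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
# illustrative synthetic classification data (assumption, not from the notebook)
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, random_state=1)
model = RandomForestClassifier(random_state=1)
model.fit(X, y)
# one impurity-based importance score per input feature
for i, score in enumerate(model.feature_importances_):
    print(f'Feature {i}: {score:.5f}')
pyplot.bar(range(len(model.feature_importances_)), model.feature_importances_)
pyplot.show()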