from matplotlib import pyplot
from numpy import arange
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, RepeatedKFold
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.datasets import make_classification, make_regression
Bagging
seed = 5
def make_clas_dataset(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=5):
    X, Y = make_classification(n_samples=n_samples, n_features=n_features,
                               n_informative=n_informative, n_redundant=n_redundant,
                               random_state=random_state)
    return X, Y
def make_reg_dataset(n_samples=1000, n_features=20, n_informative=15, noise=0.1, random_state=5):
    X, Y = make_regression(n_samples=n_samples, n_features=n_features,
                           n_informative=n_informative, noise=noise, random_state=random_state)
    return X, Y
def evaluate_clas_model(model, X, Y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    results = cross_val_score(model, X, Y, scoring='accuracy', cv=cv)
    return results
def show_results(models, X, Y):
    results, names = [], []
    # evaluate each model, collect the scores, and report mean and standard deviation
    for name, model in models.items():
        scores = evaluate_clas_model(model, X, Y)
        results.append(scores)
        names.append(name)
        print(f'{name}: {scores.mean(): .3f} ({scores.std(): .3f})')
    # summarize the distributions of scores with a box-and-whisker plot
    pyplot.boxplot(results, labels=names, showmeans=True)
    pyplot.show()
Bagged Decision Trees Ensemble
Bagging is an ensemble machine learning algorithm that combines the predictions from many decision trees. It is also easy to implement, given that it has few key hyperparameters and sensible heuristics for configuring them. Bagging performs well in general and provides the basis for a whole family of decision-tree ensemble algorithms, such as the popular random forest and extra trees ensembles, as well as the lesser-known Pasting, Random Subspaces, and Random Patches ensembles.
Bagging Ensemble Algorithm
Bootstrap Aggregation, or Bagging for short, is an ensemble machine learning algorithm. Specifically, it is an ensemble of decision tree models, although the bagging technique can also be used to combine the predictions of other types of models. As its name suggests, bootstrap aggregation is based on the idea of the bootstrap sample. A bootstrap sample is a sample of a dataset with replacement. Replacement means that a sample drawn from the dataset is replaced, allowing it to be selected again and perhaps multiple times in the new sample. This means that the sample may have duplicate examples from the original dataset. The bootstrap sampling technique is used to estimate a population statistic from a small data sample. This is achieved by drawing multiple bootstrap samples, calculating the statistic on each, and reporting the mean statistic across all samples.
An example of using bootstrap sampling would be estimating the population mean from a small dataset. Multiple bootstrap samples are drawn from the dataset, the mean calculated on each, then the mean of the estimated means is reported as an estimate of the population mean. Surprisingly, the bootstrap method provides a robust and accurate approach to estimating statistical quantities compared to a single estimate on the original dataset.
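As a concrete illustration, here is a minimal NumPy sketch of bootstrap estimation of a mean; the synthetic data and the number of bootstrap repetitions are arbitrary choices for the example, not values from this tutorial.
# Minimal bootstrap sketch: repeatedly resample with replacement and average the statistic.
# The synthetic data and 100 repetitions are arbitrary illustrative choices.
from numpy import mean
from numpy.random import default_rng
rng = default_rng(seed)
data = rng.normal(loc=50, scale=5, size=30)  # a small original sample
boot_means = [mean(rng.choice(data, size=len(data), replace=True)) for _ in range(100)]
print(f'Estimated mean: {mean(boot_means):.3f}')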
This same approach can be used to create an ensemble of decision tree models. This is achieved by drawing multiple bootstrap samples from the training dataset and fitting a decision tree on each. The predictions from the decision trees are then combined to provide a more robust and accurate prediction than a single decision tree (typically, but not always).
Predictions are made for regression problems by averaging the prediction across the decision trees. Predictions are made for classification problems by taking the majority vote prediction for the classes from across the predictions made by the decision trees. The bagged decision trees are effective because each decision tree is fit on a slightly different training dataset, which in turn allows each tree to have minor differences and make slightly different skillful predictions. Technically, we say that the method is effective because the trees have a low correlation between predictions and, in turn, prediction errors.
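To make the procedure concrete, the sketch below bags a few decision trees by hand: each tree is fit on a bootstrap sample of the rows, and the class predictions are combined by majority vote. It is a minimal illustration of the idea, not the scikit-learn implementation used later; the number of trees and the variable names are arbitrary.
# Bagging by hand (illustrative sketch): bootstrap the rows, fit one tree per sample,
# then take a majority vote over the per-tree class predictions.
from numpy import stack, apply_along_axis, bincount, argmax
from numpy.random import default_rng
from sklearn.tree import DecisionTreeClassifier
rng = default_rng(seed)
X_demo, Y_demo = make_clas_dataset()
trees = []
for _ in range(10):  # 10 trees, chosen arbitrarily for the sketch
    idx = rng.integers(0, len(X_demo), size=len(X_demo))  # bootstrap sample of row indices
    trees.append(DecisionTreeClassifier().fit(X_demo[idx], Y_demo[idx]))
preds = stack([tree.predict(X_demo[:5]) for tree in trees])  # per-tree predictions for the first 5 rows
yhat = apply_along_axis(lambda votes: argmax(bincount(votes)), 0, preds)  # majority vote per row
print(yhat)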
Decision trees, specifically unpruned decision trees, are used as they slightly overfit the training data and have a high variance. Other high-variance machine learning algorithms can be used, such as a k-nearest neighbors algorithm with a low k value, although decision trees have proven to be the most effective.
Bagging does not always offer an improvement. For low-variance models that already perform well, bagging can result in a decrease in model performance.
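One way to sanity-check this is to compare a stable, low-variance learner on its own against a bagged version of it. The snippet below uses logistic regression purely as an illustrative low-variance baseline (it is not part of the original examples), and the outcome will depend on the dataset; note that the base_estimator argument is named estimator in recent scikit-learn releases.
# Hedged comparison sketch: a low-variance model alone vs. wrapped in bagging.
from sklearn.linear_model import LogisticRegression
X_lv, Y_lv = make_clas_dataset()
candidates = {'logistic': LogisticRegression(max_iter=1000),
              'bagged logistic': BaggingClassifier(base_estimator=LogisticRegression(max_iter=1000))}
for name, model in candidates.items():
    scores = evaluate_clas_model(model, X_lv, Y_lv)
    print(f'{name}: {scores.mean(): .3f} ({scores.std(): .3f})')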
Evaluate Bagging Ensembles
Bagging for Classification
row = [-3.47224758, 1.95378146, 0.04875169, -0.91592588, -3.54022468, 1.96405547,
       -7.72564954, -2.64787168, -1.81726906, -1.67104974, 2.33762043, -4.30273117, 0.4839841,
       -1.28253034, -10.6704077, -0.7641103, -3.58493721, 2.07283886, 0.08385173, 0.91461126]
from sklearn.ensemble import BaggingClassifier
X, Y = make_clas_dataset()
X.shape, Y.shape
((1000, 20), (1000,))
model = BaggingClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
results = cross_val_score(model, X, Y, scoring='accuracy', cv=cv)
print(f'Accuracy: {results.mean(): .3f} ({results.std(): .3f})')
Accuracy: 0.865 ( 0.037)
We can also use the Bagging model as a final model and make predictions for classification.
model.fit(X, Y)
BaggingClassifier()
yhat = model.predict([row])
yhat
array([1])
print(f'Predicted Class: {yhat[0]}')
Predicted Class: 1
Bagging for Regression
from sklearn.ensemble import BaggingRegressor
X, Y = make_reg_dataset()
model = BaggingRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
results = cross_val_score(model, X, Y,
                          scoring='neg_mean_absolute_error', cv=cv)
print(f'MAE: {results.mean(): .3f} ({results.std(): .3f})')
MAE: -99.772 ( 10.417)
model.fit(X, Y)
BaggingRegressor()
yhat = model.predict([row])
print(f'Prediction: {yhat[0]}')
Prediction: -88.26220906961095
Bagging Hyperparameters
In this section, we will take a closer look at some of the hyperparameters you should consider tuning for the Bagging ensemble and their effect on model performance.
Explore Number of Trees
An important hyperparameter for the Bagging algorithm is the number of decision trees used in the ensemble. Typically, the number of trees is increased until the model performance stabilizes. Intuition might suggest that more trees will lead to overfitting, although this is not the case. Bagging and related decision-tree ensemble algorithms (like random forest) appear to be somewhat immune to overfitting the training dataset, given the stochastic nature of the learning algorithm.
def get_models(param):
    models = dict()

    # vary the number of trees in the ensemble
    if param == 'n_estimators':
        n_trees = [10, 50]
        for n in n_trees:
            models[str(n)] = BaggingClassifier(n_estimators=n)

    # vary the size of the bootstrap sample
    if param == 'max_samples':
        for i in arange(0.1, 1.1, 0.1):
            key = '%.1f' % i
            models[key] = BaggingClassifier(max_samples=i)

    # vary the base estimator (k-nearest neighbors with different k values)
    if param == 'base_estimator':
        for i in range(1, 21):
            base = KNeighborsClassifier(n_neighbors=i)
            models[str(i)] = BaggingClassifier(base_estimator=base)

    return models
X, Y = make_clas_dataset()
models = get_models('n_estimators')
show_results(models, X, Y)
10: 0.858 ( 0.043)
50: 0.875 ( 0.038)
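If the scores are still improving at 50 trees, the grid can simply be widened; the tree counts below are arbitrary illustrative choices.
# Widen the n_estimators grid until performance stabilizes (counts are illustrative).
models = {str(n): BaggingClassifier(n_estimators=n) for n in [10, 50, 100, 500]}
show_results(models, X, Y)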
Explore Number of Samples
The size of the bootstrap sample can also be varied. The default is to create a bootstrap sample that has the same number of examples as the original dataset. Using a smaller sample can increase the variance of the resulting decision trees and could result in better overall performance.
X, Y = make_clas_dataset()
models = get_models(param='max_samples')
show_results(models, X, Y)
0.1: 0.799 ( 0.045)
0.2: 0.833 ( 0.038)
0.3: 0.839 ( 0.036)
0.4: 0.843 ( 0.043)
0.5: 0.849 ( 0.040)
0.6: 0.859 ( 0.035)
0.7: 0.861 ( 0.033)
0.8: 0.861 ( 0.038)
0.9: 0.854 ( 0.042)
1.0: 0.865 ( 0.037)
Explore Alternate Algorithm
Decision trees are the most common algorithm used in a bagging ensemble. The reason for this is that they are easy to configure to have a high variance and because they perform well in general. Other algorithms can be used with bagging and must be configured to have a modestly high variance. One example is the k-nearest neighbors algorithm, where the k value can be set to a low value. The algorithm used in the ensemble is specified via the base_estimator argument and must be set to an instance of the algorithm, configured as desired.
X, Y = make_clas_dataset()
models = get_models('base_estimator')
show_results(models, X, Y)
1: 0.884 ( 0.035)
2: 0.884 ( 0.034)
3: 0.887 ( 0.038)
4: 0.886 ( 0.035)
5: 0.886 ( 0.037)
6: 0.882 ( 0.038)
7: 0.876 ( 0.041)
8: 0.879 ( 0.038)
9: 0.876 ( 0.035)
10: 0.876 ( 0.033)
11: 0.875 ( 0.035)
12: 0.872 ( 0.031)
13: 0.876 ( 0.032)
14: 0.878 ( 0.029)
15: 0.872 ( 0.031)
16: 0.871 ( 0.036)
17: 0.874 ( 0.032)
18: 0.873 ( 0.039)
19: 0.873 ( 0.035)
20: 0.873 ( 0.036)
Random Subspace Ensemble
Random Subspace Ensemble is a machine learning algorithm that combines the predictions from multiple decision trees trained on different subsets of columns in the training dataset. Randomly varying the columns used to train each contributing member of the ensemble introduces diversity into the ensemble and, in turn, can lift performance over a single decision tree. It is related to other ensembles of decision trees, such as bootstrap aggregation (bagging), which creates trees from different samples of rows in the training dataset, and random forest, which combines ideas from bagging and the random subspace ensemble. Although decision trees are often used, the general random subspace method can be used with any machine learning model whose performance varies meaningfully with the choice of input features. In this tutorial, you will discover how to develop random subspace ensembles for classification and regression. After completing this tutorial, you will know:
* Random subspace ensembles are created from decision trees fit on different samples of features (columns) in the training dataset.
* How to use the random subspace ensemble for classification and regression with scikit-learn.
* How to explore the effect of random subspace model hyperparameters on model performance.
Evaluate Random Subspace Ensembles
Random Subspace Ensemble for Classification
X, Y = make_clas_dataset()
model = BaggingClassifier(bootstrap=False, max_features=10)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
results = cross_val_score(model, X, Y, scoring='accuracy', cv=cv)
print(f'{results.mean():.3f} ({results.std():.3f})')
model.fit(X, Y)
yhat = model.predict([row])
print(yhat[0])
Random Subspace Ensemble for Regression
X, Y = make_reg_dataset()
model = BaggingRegressor(bootstrap=False, max_features=10)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
results = cross_val_score(model, X, Y, scoring='neg_mean_absolute_error', cv=cv)
print(f'{results.mean():.3f} ({results.std():.3f})')
model.fit(X, Y)
yhat = model.predict([reg_row])
print(f'{int(yhat[0])}')
Random Subspace Ensemble Hyperparameters
Explore Number of Trees
def get_models():
    models = dict()
    n_trees = [10, 50, 100]
    for n in n_trees:
        models[str(n)] = BaggingClassifier(n_estimators=n, bootstrap=False, max_features=10)
    return models
X, Y = make_clas_dataset()
models = get_models()
models
show_results(models, X, Y)
Explore Number of Features
def get_models():
    models = dict()
    for n in range(1, 5):
        models[str(n)] = BaggingClassifier(n_estimators=100, bootstrap=False, max_features=n)
    return models
X, Y = make_clas_dataset()
models = get_models()
show_results(models, X, Y)
Explore Alternate Algorithm
X, Y = make_clas_dataset()
model = BaggingClassifier(base_estimator=KNeighborsClassifier(), bootstrap=False, max_features=10)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
results = cross_val_score(model, X, Y, scoring='accuracy', cv=cv)
print(f'Mean Accuracy: {results.mean():.3f} ({results.std():.3f})')
Feature Selection Bagging Ensemble
Single Feature Selection Method Ensembles
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
ANOVA F-statistic Ensemble
from sklearn.feature_selection import SelectKBest, f_classif
def get_ensemble(n_features):
    models = []
    for i in range(1, n_features+1):
        feat_selec = SelectKBest(score_func=f_classif, k=i)
        model = DecisionTreeClassifier()
        pipe = Pipeline([('fs', feat_selec), ('m', model)])
        models.append((str(i), pipe))
    ensemble = VotingClassifier(estimators=models, voting='hard')
    return ensemble
X, Y = make_clas_dataset()
ensemble = get_ensemble(X.shape[1])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
results = cross_val_score(ensemble, X, Y, scoring='accuracy', cv=cv)
print(f'{results.mean(): .3f} ({results.std(): .3f})')
Mutual Information Ensemble
from sklearn.feature_selection import SelectKBest, mutual_info_classif
def get_ensemble(n_features):
    models = []
    for i in range(1, n_features+1):
        fs = SelectKBest(score_func=mutual_info_classif, k=i)
        model = DecisionTreeClassifier()
        pipe = Pipeline([('fs', fs), ('m', model)])
        models.append((str(i), pipe))
    ensemble = VotingClassifier(estimators=models, voting='hard')
    return ensemble
X, Y = make_clas_dataset()
ensemble = get_ensemble(X.shape[1])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
results = cross_val_score(ensemble, X, Y, scoring='accuracy', cv=cv)
print(f'{results.mean(): .3f} ({results.std(): .3f})')
Recursive Feature Selection Ensemble
from sklearn.feature_selection import RFE
def get_ensemble(n_features):
    models = []
    for i in range(1, n_features+1):
        fs = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=i)
        model = DecisionTreeClassifier()
        pipe = Pipeline([('fs', fs), ('m', model)])
        models.append((str(i), pipe))
    ensemble = VotingClassifier(estimators=models, voting='hard')
    return ensemble
X, Y = make_clas_dataset()
ensemble = get_ensemble(X.shape[1])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
results = cross_val_score(ensemble, X, Y, scoring='accuracy', cv=cv)
print(f'{results.mean(): .3f} ({results.std(): .3f})')
Combined Feature Selection Ensembles
Ensemble With Fixed Number of Features
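The source contains no code for this subsection. As a hedged sketch, one plausible construction combines the three feature selection methods used above, each keeping the same fixed number of features (15 here is an arbitrary choice), under a single hard-voting classifier:
# Sketch only (not from the source): one pipeline per feature selection method,
# each selecting the same fixed number of features, combined by hard voting.
def get_ensemble(n_features):
    models = []
    fs_methods = [('anova', SelectKBest(score_func=f_classif, k=n_features)),
                  ('mutinfo', SelectKBest(score_func=mutual_info_classif, k=n_features)),
                  ('rfe', RFE(estimator=DecisionTreeClassifier(), n_features_to_select=n_features))]
    for name, fs in fs_methods:
        models.append((name, Pipeline([('fs', fs), ('m', DecisionTreeClassifier())])))
    return VotingClassifier(estimators=models, voting='hard')
X, Y = make_clas_dataset()
ensemble = get_ensemble(15)  # 15 features kept per method is an arbitrary illustrative choice
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
results = cross_val_score(ensemble, X, Y, scoring='accuracy', cv=cv)
print(f'{results.mean(): .3f} ({results.std(): .3f})')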
Ensemble With Contiguous Number of Features
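Likewise, a contiguous-range variant (again a sketch rather than code from the source) could sweep each selection method over a contiguous range of feature counts and pool all of the resulting pipelines into one voting ensemble:
# Sketch only (not from the source): pipelines over a contiguous range of feature counts
# for each selection method, all pooled into a single hard-voting ensemble.
def get_ensemble(n_start, n_end):
    models = []
    for i in range(n_start, n_end + 1):
        models.append((f'anova-{i}', Pipeline([('fs', SelectKBest(score_func=f_classif, k=i)),
                                               ('m', DecisionTreeClassifier())])))
        models.append((f'mutinfo-{i}', Pipeline([('fs', SelectKBest(score_func=mutual_info_classif, k=i)),
                                                 ('m', DecisionTreeClassifier())])))
        models.append((f'rfe-{i}', Pipeline([('fs', RFE(estimator=DecisionTreeClassifier(),
                                                        n_features_to_select=i)),
                                             ('m', DecisionTreeClassifier())])))
    return VotingClassifier(estimators=models, voting='hard')
X, Y = make_clas_dataset()
ensemble = get_ensemble(1, 8)  # the 1-8 range is an arbitrary illustrative choice
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
results = cross_val_score(ensemble, X, Y, scoring='accuracy', cv=cv)
print(f'{results.mean(): .3f} ({results.std(): .3f})')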
Random Forest Ensemble
Evaluate Random Forest Ensembles
Random Forest for Classification
from sklearn.ensemble import RandomForestClassifier
X, Y = make_clas_dataset(random_state=3)
model = RandomForestClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
results = cross_val_score(model, X, Y, scoring='accuracy', cv=cv)
print(f'Mean Accuracy: {results.mean():.3f} ({results.std():.3f})')
model.fit(X, Y)
row = [-8.52381793, 5.24451077, -12.14967704, -2.92949242, 0.99314133, 0.67326595,
       -0.38657932, 1.27955683, -0.60712621, 3.20807316, 0.60504151, -1.38706415, 8.92444588,
       -7.43027595, -2.33653219, 1.10358169, 0.21547782, 1.05057966, 0.6975331, 0.26076035]
yhat = model.predict([row])
print(yhat[0])
Random Forest for Regression
from sklearn.ensemble import RandomForestRegressor
X, Y = make_reg_dataset(random_state=2)
model = RandomForestRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
results = cross_val_score(model, X, Y, scoring='neg_mean_absolute_error', cv=cv)
print(f'{results.mean():.3f} ({results.std():.3f})')
model.fit(X, Y)
row = [-0.89483109, -1.0670149, -0.25448694, -0.53850126, 0.21082105, 1.37435592,
       0.71203659, 0.73093031, -1.25878104, -2.01656886, 0.51906798, 0.62767387, 0.96250155,
       1.31410617, -1.25527295, -0.85079036, 0.24129757, -0.17571721, -1.11454339, 0.36268268]
yhat = model.predict([row])
print(f'{yhat[0]}')
Random Forest Hyperparameters
from sklearn.ensemble import RandomForestClassifier
Explore Number of Samples
from numpy import arange
def get_models():
    models = dict()
    for i in arange(0.1, 1.1, 0.1):
        key = '%.1f' % i
        if i == 1.0:
            i = None
        models[key] = RandomForestClassifier(max_samples=i)
    return models
X, y = make_clas_dataset()
models = get_models()
show_results(models, X, y)
Explore Number of Features
def get_models():
    models = dict()
    for i in range(1, 8):
        models[str(i)] = RandomForestClassifier(max_features=i)
    return models
X, y = make_clas_dataset()
models = get_models()
show_results(models, X, y)
Explore Number of Trees
def get_models():
    models = dict()
    n_trees = [10, 50, 100]
    for n in n_trees:
        models[str(n)] = RandomForestClassifier(n_estimators=n)
    return models
X, y = make_clas_dataset()
models = get_models()
models
show_results(models, X, y)
Explore Tree Depth
def get_models():
    models = dict()
    depths = [i for i in range(1, 4)] + [None]
    for n in depths:
        models[str(n)] = RandomForestClassifier(max_depth=n)
    return models
X, y = make_clas_dataset()
models = get_models()
models
show_results(models, X, y)
Extra Trees Ensemble
Evaluate Extra Trees Ensembles
Extra Trees for Classification
from sklearn.ensemble import ExtraTreesClassifier
X, y = make_clas_dataset(random_state=4)
model = ExtraTreesClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
results = cross_val_score(model, X, y, scoring='accuracy', cv=cv)
print(f'{results.mean(): .3f} ({results.std(): .3f})')
0.907 ( 0.025)
model.fit(X, y)
ExtraTreesClassifier()
row = [-3.52169364, 4.00560592, 2.94756812, -0.09755101, -0.98835896, 1.81021933,
       -0.32657994, 1.08451928, 4.98150546, -2.53855736, 3.43500614, 1.64660497, -4.1557091,
       -1.55301045, -0.30690987, -1.47665577, 6.818756, 0.5132918, 4.3598337, -4.31785495]
yhat = model.predict([row])
print(f'{yhat[0]}')
0
Extra Trees for Regression
from sklearn.ensemble import ExtraTreesRegressor
X, y = make_reg_dataset(random_state=3)
model = ExtraTreesRegressor()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
results = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv)
print(f'{results.mean(): .3f} ({results.std(): .3f})')
-69.667 ( 4.984)
model.fit(X, y)
ExtraTreesRegressor()
row = [-0.56996683, 0.80144889, 2.77523539, 1.32554027, -1.44494378, -0.80834175,
       -0.84142896, 0.57710245, 0.96235932, -0.66303907, -1.13994112, 0.49887995, 1.40752035,
       -0.2995842, -0.05708706, -2.08701456, 1.17768469, 0.13474234, 0.09518152, -0.07603207]
yhat = model.predict([row])
print(f'{int(yhat[0])}')
53
Extra Trees Hyperparameters
from sklearn.ensemble import ExtraTreesClassifier
Explore Number of Trees
def get_models():
    models = dict()
    n_trees = [10, 50, 100]
    for n in n_trees:
        models[str(n)] = ExtraTreesClassifier(n_estimators=n)
    return models
X, y = make_clas_dataset(random_state=4)
models = get_models()
models
{'10': ExtraTreesClassifier(n_estimators=10),
'50': ExtraTreesClassifier(n_estimators=50),
'100': ExtraTreesClassifier()}
show_results(models, X, y)
10: 0.855 ( 0.025)
50: 0.902 ( 0.028)
100: 0.906 ( 0.023)
Explore Number of Features
def get_models():
    models = dict()
    for i in range(1, 5):
        models[str(i)] = ExtraTreesClassifier(max_features=i)
    return models
X, y = make_clas_dataset(random_state=4)
models = get_models()
models
{'1': ExtraTreesClassifier(max_features=1),
'2': ExtraTreesClassifier(max_features=2),
'3': ExtraTreesClassifier(max_features=3),
'4': ExtraTreesClassifier(max_features=4)}
show_results(models, X, y)
1: 0.895 ( 0.025)
2: 0.901 ( 0.026)
3: 0.907 ( 0.028)
4: 0.904 ( 0.025)
Explore Minimum Samples per Split
def get_models():
    models = dict()
    for i in range(2, 5):
        models[str(i)] = ExtraTreesClassifier(min_samples_split=i)
    return models
X, y = make_clas_dataset(random_state=4)
models = get_models()
models
{'2': ExtraTreesClassifier(),
'3': ExtraTreesClassifier(min_samples_split=3),
'4': ExtraTreesClassifier(min_samples_split=4)}
show_results(models, X, y)
2: 0.909 ( 0.026)
3: 0.913 ( 0.025)
4: 0.909 ( 0.027)