data_path = '/home/naji/Desktop/github-repos/machine-learning/nbs/0-datasets/data/'
Data Transforms
Scale Numerical Data
Numerical Data Scaling Methods
from pandas import read_csv, DataFrame
from numpy import asarray, mean, std
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from matplotlib import pyplot
Data Normalization
from sklearn.preprocessing import MinMaxScaler
data = asarray([[100, 0.001],
                [8, 0.05],
                [50, 0.005],
                [88, 0.07],
                [4, 0.1]])
print(data)
[[1.0e+02 1.0e-03]
[8.0e+00 5.0e-02]
[5.0e+01 5.0e-03]
[8.8e+01 7.0e-02]
[4.0e+00 1.0e-01]]
scaler = MinMaxScaler()
scaled = scaler.fit_transform(data)
print(scaled)
[[1. 0. ]
[0.04166667 0.49494949]
[0.47916667 0.04040404]
[0.875 0.6969697 ]
[0. 1. ]]
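For reference, normalization computes y = (x - min) / (max - min) per column. A minimal manual check, not part of the original notebook, that should reproduce MinMaxScaler's output:

# Manual per-column min-max normalization; should match the scaled output above
col_min, col_max = data.min(axis=0), data.max(axis=0)
print((data - col_min) / (col_max - col_min))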
Data Standardization
from sklearn.preprocessing import StandardScaler
data = asarray([[100, 0.001],
                [8, 0.05],
                [50, 0.005],
                [88, 0.07],
                [4, 0.1]])
scaler = StandardScaler()
scaled = scaler.fit_transform(data)
print(scaled)
[[ 1.26398112 -1.16389967]
[-1.06174414 0.12639634]
[ 0. -1.05856939]
[ 0.96062565 0.65304778]
[-1.16286263 1.44302493]]
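Standardization computes z = (x - mean) / std per column, where StandardScaler uses the population standard deviation (ddof=0). A minimal manual check, not part of the original notebook:

# Manual per-column standardization; should match the scaled output above
print((data - data.mean(axis=0)) / data.std(axis=0))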
Diabetes Dataset
df = read_csv(data_path + 'pima-indians-diabetes.csv', header=None)
df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 0 |
764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 0 |
765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0 |
766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 1 |
767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 0 |
768 rows × 9 columns
df.shape
(768, 9)
print(df.describe())
0 1 2 ... 6 7 8
count 768.000000 768.000000 768.000000 ... 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 ... 0.471876 33.240885 0.348958
std 3.369578 31.972618 19.355807 ... 0.331329 11.760232 0.476951
min 0.000000 0.000000 0.000000 ... 0.078000 21.000000 0.000000
25% 1.000000 99.000000 62.000000 ... 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 ... 0.372500 29.000000 0.000000
75% 6.000000 140.250000 80.000000 ... 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 ... 2.420000 81.000000 1.000000
[8 rows x 9 columns]
fig = df.hist(xlabelsize=4, ylabelsize=4)
[x.title.set_size(4) for x in fig.ravel()]
pyplot.show()
data = df.values
X, y = data[:, :-1], data[:, -1]
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
model = KNeighborsClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)
print(f'Accuracy: {mean(scores):.3f} {std(scores):.3f}')
Accuracy: 0.717 0.040
MinMaxScaler Transform
df = read_csv(data_path + 'pima-indians-diabetes.csv', header=None)
data = df.values
trans = MinMaxScaler()
data = trans.fit_transform(data)
df = DataFrame(data)
print(df.describe())
0 1 2 ... 6 7 8
count 768.000000 768.000000 768.000000 ... 768.000000 768.000000 768.000000
mean 0.226180 0.607510 0.566438 ... 0.168179 0.204015 0.348958
std 0.198210 0.160666 0.158654 ... 0.141473 0.196004 0.476951
min 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000
25% 0.058824 0.497487 0.508197 ... 0.070773 0.050000 0.000000
50% 0.176471 0.587940 0.590164 ... 0.125747 0.133333 0.000000
75% 0.352941 0.704774 0.655738 ... 0.234095 0.333333 1.000000
max 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000
[8 rows x 9 columns]
fig = df.hist(xlabelsize=4, ylabelsize=4)
[x.title.set_size(4) for x in fig.ravel()]
pyplot.show()
Train the model
df = read_csv(data_path + 'pima-indians-diabetes.csv', header=None)
data = df.values
X, y = data[:, :-1], data[:, -1]
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
trans = MinMaxScaler()
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('t', trans), ('m', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
print(f'Accuracy: {mean(scores):.3f}, {std(scores):.3f}')
Accuracy: 0.739, 0.053
StandardScaler Transform
df = read_csv(data_path + 'pima-indians-diabetes.csv', header=None)
data = df.values[:, :-1]
trans = StandardScaler()
data = trans.fit_transform(data)
df = DataFrame(data)
print(df.describe())
0 1 ... 6 7
count 7.680000e+02 7.680000e+02 ... 7.680000e+02 7.680000e+02
mean -6.476301e-17 -9.251859e-18 ... 2.451743e-16 1.931325e-16
std 1.000652e+00 1.000652e+00 ... 1.000652e+00 1.000652e+00
min -1.141852e+00 -3.783654e+00 ... -1.189553e+00 -1.041549e+00
25% -8.448851e-01 -6.852363e-01 ... -6.889685e-01 -7.862862e-01
50% -2.509521e-01 -1.218877e-01 ... -3.001282e-01 -3.608474e-01
75% 6.399473e-01 6.057709e-01 ... 4.662269e-01 6.602056e-01
max 3.906578e+00 2.444478e+00 ... 5.883565e+00 4.063716e+00
[8 rows x 8 columns]
fig = df.hist(xlabelsize=4, ylabelsize=4)
[x.title.set_size(4) for x in fig.ravel()]
pyplot.show()
Train the model
df = read_csv(data_path + 'pima-indians-diabetes.csv', header=None)
data = df.values
X, y = data[:, :-1], data[:, -1]
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
trans = StandardScaler()
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('t', trans), ('m', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
print(f'Accuracy: {mean(scores):.3f} {std(scores):.3f}')
Accuracy: 0.741 0.050
Scale Data With Outliers
IQR Robust Scaler Transform
from sklearn.preprocessing import RobustScaler
df = read_csv(data_path + 'pima-indians-diabetes.csv', header=None)
data = df.values[:, :-1]
trans = RobustScaler()
data = trans.fit_transform(data)
df = DataFrame(data)
print(df.describe())
0 1 2 ... 5 6 7
count 768.000000 768.000000 768.000000 ... 768.000000 768.000000 768.000000
mean 0.169010 0.094413 -0.160807 ... -0.000798 0.259807 0.249464
std 0.673916 0.775094 1.075323 ... 0.847759 0.866219 0.691778
min -0.600000 -2.836364 -4.000000 ... -3.440860 -0.769935 -0.470588
25% -0.400000 -0.436364 -0.555556 ... -0.505376 -0.336601 -0.294118
50% 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000
75% 0.600000 0.563636 0.444444 ... 0.494624 0.663399 0.705882
max 2.800000 1.987879 2.777778 ... 3.774194 5.352941 3.058824
[8 rows x 8 columns]
fig = df.hist(xlabelsize=4, ylabelsize=4)
[x.title.set_size(4) for x in fig.ravel()]
pyplot.show()
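Under the default quantile_range=(25.0, 75.0), RobustScaler computes value = (value - median) / (q75 - q25) per column. A minimal sketch of the same calculation with numpy, not part of the original:

# Manual robust scaling of the raw features; should match RobustScaler's output
from numpy import percentile, median
raw = read_csv(data_path + 'pima-indians-diabetes.csv', header=None).values[:, :-1]
q25, q75 = percentile(raw, 25, axis=0), percentile(raw, 75, axis=0)
manual = (raw - median(raw, axis=0)) / (q75 - q25)
print(manual[:3, :])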
Train the model
df = read_csv(data_path + 'pima-indians-diabetes.csv', header=None)
data = df.values
X, y = data[:, :-1], data[:, -1]
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
trans = RobustScaler()
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('t', trans), ('m', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
print(f'Accuracy: {mean(scores):.3f} {std(scores):.3f}')
Accuracy: 0.734 0.044
Explore Robust Scaler Range
def get_dataset():
    df = read_csv(data_path + 'pima-indians-diabetes.csv', header=None)
    data = df.values
    X, y = data[:, :-1], data[:, -1]
    X = X.astype('float32')
    y = LabelEncoder().fit_transform(y.astype('str'))
    return X, y

def get_models():
    models = dict()
    for value in [1, 5, 10, 20, 25, 30]:
        trans = RobustScaler(quantile_range=(value, 100 - value))
        model = KNeighborsClassifier()
        models[str(value)] = Pipeline(steps=[('t', trans), ('m', model)])
    return models

def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)
    return scores
X, y = get_dataset()
models = get_models()
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print(f'{name} {mean(scores):.3f} {std(scores):.3f}')
1 0.734 0.054
5 0.736 0.051
10 0.739 0.047
20 0.734 0.050
25 0.734 0.044
30 0.735 0.042
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
Encode Categorical Data
Encoding Categorical Data
Ordinal Encoding
from sklearn.preprocessing import OrdinalEncoder
data = asarray([['red'], ['green'], ['blue']])
print(data)
[['red']
['green']
['blue']]
encoder = OrdinalEncoder()
result = encoder.fit_transform(data)
print(result)
[[2.]
[1.]
[0.]]
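By default, OrdinalEncoder assigns integers to categories in sorted (here alphabetical) order, which is why blue maps to 0 and red to 2. If the variable has a natural order, it can be passed explicitly through the categories argument; a short sketch:

# Hypothetical explicit ordering: red -> 0, green -> 1, blue -> 2
encoder = OrdinalEncoder(categories=[['red', 'green', 'blue']])
print(encoder.fit_transform(asarray([['red'], ['green'], ['blue']])))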
One Hot Encoding
from sklearn.preprocessing import OneHotEncoder
data = asarray([['red'], ['green'], ['blue']])
print(data)
[['red']
['green']
['blue']]
encoder = OneHotEncoder(sparse=False)
onehot = encoder.fit_transform(data)
print(onehot)
[[0. 0. 1.]
[0. 1. 0.]
[1. 0. 0.]]
Dummy Variable Encoding
from sklearn.preprocessing import OneHotEncoder
data = asarray([['red'], ['green'], ['blue']])
print(data)
[['red']
['green']
['blue']]
encoder = OneHotEncoder(drop='first', sparse=False)
onehot = encoder.fit_transform(data)
print(onehot)
[[0. 1.]
[1. 0.]
[0. 0.]]
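A similar dummy encoding is available directly in pandas through get_dummies with drop_first=True; a minimal sketch (pandas is not used for encoding elsewhere in this notebook):

# Dummy-encode a single categorical column, dropping the first level
from pandas import DataFrame, get_dummies
colors = DataFrame({'color': ['red', 'green', 'blue']})
print(get_dummies(colors, drop_first=True))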
Breast Cancer Dataset
df = read_csv(data_path + 'breast-cancer.csv', header=None)
df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | '40-49' | 'premeno' | '15-19' | '0-2' | 'yes' | '3' | 'right' | 'left_up' | 'no' | 'recurrence-events' |
1 | '50-59' | 'ge40' | '15-19' | '0-2' | 'no' | '1' | 'right' | 'central' | 'no' | 'no-recurrence-events' |
2 | '50-59' | 'ge40' | '35-39' | '0-2' | 'no' | '2' | 'left' | 'left_low' | 'no' | 'recurrence-events' |
3 | '40-49' | 'premeno' | '35-39' | '0-2' | 'yes' | '3' | 'right' | 'left_low' | 'yes' | 'no-recurrence-events' |
4 | '40-49' | 'premeno' | '30-34' | '3-5' | 'yes' | '2' | 'left' | 'right_up' | 'no' | 'recurrence-events' |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
281 | '50-59' | 'ge40' | '30-34' | '6-8' | 'yes' | '2' | 'left' | 'left_low' | 'no' | 'no-recurrence-events' |
282 | '50-59' | 'premeno' | '25-29' | '3-5' | 'yes' | '2' | 'left' | 'left_low' | 'yes' | 'no-recurrence-events' |
283 | '30-39' | 'premeno' | '30-34' | '6-8' | 'yes' | '2' | 'right' | 'right_up' | 'no' | 'no-recurrence-events' |
284 | '50-59' | 'premeno' | '15-19' | '0-2' | 'no' | '2' | 'right' | 'left_low' | 'no' | 'no-recurrence-events' |
285 | '50-59' | 'ge40' | '40-44' | '0-2' | 'no' | '3' | 'left' | 'right_up' | 'no' | 'no-recurrence-events' |
286 rows × 10 columns
data = df.values
X = data[:, :-1].astype(str)
y = data[:, -1].astype(str)
print('Input', X.shape)
Input (286, 9)
print('Output', y.shape)
Output (286,)
OrdinalEncoder Transform
ordinal_encoder = OrdinalEncoder()
X = ordinal_encoder.fit_transform(X)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
print('Input', X.shape)
Input (286, 9)
X[:10, :]
array([[2., 2., 2., 0., 1., 2., 1., 2., 0.],
[3., 0., 2., 0., 0., 0., 1., 0., 0.],
[3., 0., 6., 0., 0., 1., 0., 1., 0.],
[2., 2., 6., 0., 1., 2., 1., 1., 1.],
[2., 2., 5., 4., 1., 1., 0., 4., 0.],
[3., 2., 4., 4., 0., 1., 1., 2., 1.],
[3., 0., 7., 0., 0., 2., 0., 2., 0.],
[2., 2., 1., 0., 0., 1., 0., 2., 0.],
[2., 2., 0., 0., 0., 1., 1., 3., 0.],
[2., 0., 7., 2., 1., 1., 1., 2., 1.]])
print('Output', y.shape)
Output (286,)
y[:10]
array([1, 0, 1, 0, 1, 0, 0, 0, 0, 0])
Training a model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
df = read_csv(data_path + 'breast-cancer.csv', header=None)
data = df.values
X = data[:, :-1].astype(str)
y = data[:, -1].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X_train)
OrdinalEncoder()
X_train = ordinal_encoder.transform(X_train)
X_test = ordinal_encoder.transform(X_test)
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
LabelEncoder()
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)
model = LogisticRegression()
model.fit(X_train, y_train)
LogisticRegression()
yhat = model.predict(X_test)
accuracy = accuracy_score(y_test, yhat)
print(f'Accuracy: {accuracy * 100:.3f}')
Accuracy: 75.789
OneHotEncoder Transform
df = read_csv(data_path + 'breast-cancer.csv', header=None)
data = df.values
X = data[:, :-1].astype(str)
y = data[:, -1].astype(str)
onehot_encoder = OneHotEncoder(sparse=False)
X = onehot_encoder.fit_transform(X)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
print('Input', X.shape)
Input (286, 43)
print(X[:5, :])
[[0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.]
[0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1.]
[0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0.]]
Train the model
df = read_csv(data_path + 'breast-cancer.csv', header=None)
data = df.values
X = data[:, :-1].astype(str)
y = data[:, -1].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
onehot_encoder = OneHotEncoder()
onehot_encoder.fit(X_train)
OneHotEncoder()
X_train = onehot_encoder.transform(X_train)
X_test = onehot_encoder.transform(X_test)
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
LabelEncoder()
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)
model = LogisticRegression()
model.fit(X_train, y_train)
LogisticRegression()
yhat = model.predict(X_test)
accuracy = accuracy_score(y_test, yhat)
print(f'Accuracy: {accuracy * 100:.3f}')
Accuracy: 70.526
How to Make Distributions More Gaussian
Power Transforms
from numpy import exp
from numpy.random import randn
from sklearn.preprocessing import PowerTransformer
data = randn(1000)
data = exp(data)
pyplot.hist(data, bins=25)
pyplot.show()
data = data.reshape(len(data), 1)
power = PowerTransformer(method='yeo-johnson', standardize=True)
data_trans = power.fit_transform(data)
pyplot.hist(data_trans, bins=25)
pyplot.show()
Sonar Dataset
from sklearn.preprocessing import LabelEncoder
df = read_csv(data_path + 'sonar.csv', header=None)
df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0200 | 0.0371 | 0.0428 | 0.0207 | 0.0954 | 0.0986 | 0.1539 | 0.1601 | 0.3109 | 0.2111 | 0.1609 | 0.1582 | 0.2238 | 0.0645 | 0.0660 | 0.2273 | 0.3100 | 0.2999 | 0.5078 | 0.4797 | 0.5783 | 0.5071 | 0.4328 | 0.5550 | 0.6711 | 0.6415 | 0.7104 | 0.8080 | 0.6791 | 0.3857 | 0.1307 | 0.2604 | 0.5121 | 0.7547 | 0.8537 | 0.8507 | 0.6692 | 0.6097 | 0.4943 | 0.2744 | 0.0510 | 0.2834 | 0.2825 | 0.4256 | 0.2641 | 0.1386 | 0.1051 | 0.1343 | 0.0383 | 0.0324 | 0.0232 | 0.0027 | 0.0065 | 0.0159 | 0.0072 | 0.0167 | 0.0180 | 0.0084 | 0.0090 | 0.0032 | R |
1 | 0.0453 | 0.0523 | 0.0843 | 0.0689 | 0.1183 | 0.2583 | 0.2156 | 0.3481 | 0.3337 | 0.2872 | 0.4918 | 0.6552 | 0.6919 | 0.7797 | 0.7464 | 0.9444 | 1.0000 | 0.8874 | 0.8024 | 0.7818 | 0.5212 | 0.4052 | 0.3957 | 0.3914 | 0.3250 | 0.3200 | 0.3271 | 0.2767 | 0.4423 | 0.2028 | 0.3788 | 0.2947 | 0.1984 | 0.2341 | 0.1306 | 0.4182 | 0.3835 | 0.1057 | 0.1840 | 0.1970 | 0.1674 | 0.0583 | 0.1401 | 0.1628 | 0.0621 | 0.0203 | 0.0530 | 0.0742 | 0.0409 | 0.0061 | 0.0125 | 0.0084 | 0.0089 | 0.0048 | 0.0094 | 0.0191 | 0.0140 | 0.0049 | 0.0052 | 0.0044 | R |
2 | 0.0262 | 0.0582 | 0.1099 | 0.1083 | 0.0974 | 0.2280 | 0.2431 | 0.3771 | 0.5598 | 0.6194 | 0.6333 | 0.7060 | 0.5544 | 0.5320 | 0.6479 | 0.6931 | 0.6759 | 0.7551 | 0.8929 | 0.8619 | 0.7974 | 0.6737 | 0.4293 | 0.3648 | 0.5331 | 0.2413 | 0.5070 | 0.8533 | 0.6036 | 0.8514 | 0.8512 | 0.5045 | 0.1862 | 0.2709 | 0.4232 | 0.3043 | 0.6116 | 0.6756 | 0.5375 | 0.4719 | 0.4647 | 0.2587 | 0.2129 | 0.2222 | 0.2111 | 0.0176 | 0.1348 | 0.0744 | 0.0130 | 0.0106 | 0.0033 | 0.0232 | 0.0166 | 0.0095 | 0.0180 | 0.0244 | 0.0316 | 0.0164 | 0.0095 | 0.0078 | R |
3 | 0.0100 | 0.0171 | 0.0623 | 0.0205 | 0.0205 | 0.0368 | 0.1098 | 0.1276 | 0.0598 | 0.1264 | 0.0881 | 0.1992 | 0.0184 | 0.2261 | 0.1729 | 0.2131 | 0.0693 | 0.2281 | 0.4060 | 0.3973 | 0.2741 | 0.3690 | 0.5556 | 0.4846 | 0.3140 | 0.5334 | 0.5256 | 0.2520 | 0.2090 | 0.3559 | 0.6260 | 0.7340 | 0.6120 | 0.3497 | 0.3953 | 0.3012 | 0.5408 | 0.8814 | 0.9857 | 0.9167 | 0.6121 | 0.5006 | 0.3210 | 0.3202 | 0.4295 | 0.3654 | 0.2655 | 0.1576 | 0.0681 | 0.0294 | 0.0241 | 0.0121 | 0.0036 | 0.0150 | 0.0085 | 0.0073 | 0.0050 | 0.0044 | 0.0040 | 0.0117 | R |
4 | 0.0762 | 0.0666 | 0.0481 | 0.0394 | 0.0590 | 0.0649 | 0.1209 | 0.2467 | 0.3564 | 0.4459 | 0.4152 | 0.3952 | 0.4256 | 0.4135 | 0.4528 | 0.5326 | 0.7306 | 0.6193 | 0.2032 | 0.4636 | 0.4148 | 0.4292 | 0.5730 | 0.5399 | 0.3161 | 0.2285 | 0.6995 | 1.0000 | 0.7262 | 0.4724 | 0.5103 | 0.5459 | 0.2881 | 0.0981 | 0.1951 | 0.4181 | 0.4604 | 0.3217 | 0.2828 | 0.2430 | 0.1979 | 0.2444 | 0.1847 | 0.0841 | 0.0692 | 0.0528 | 0.0357 | 0.0085 | 0.0230 | 0.0046 | 0.0156 | 0.0031 | 0.0054 | 0.0105 | 0.0110 | 0.0015 | 0.0072 | 0.0048 | 0.0107 | 0.0094 | R |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
203 | 0.0187 | 0.0346 | 0.0168 | 0.0177 | 0.0393 | 0.1630 | 0.2028 | 0.1694 | 0.2328 | 0.2684 | 0.3108 | 0.2933 | 0.2275 | 0.0994 | 0.1801 | 0.2200 | 0.2732 | 0.2862 | 0.2034 | 0.1740 | 0.4130 | 0.6879 | 0.8120 | 0.8453 | 0.8919 | 0.9300 | 0.9987 | 1.0000 | 0.8104 | 0.6199 | 0.6041 | 0.5547 | 0.4160 | 0.1472 | 0.0849 | 0.0608 | 0.0969 | 0.1411 | 0.1676 | 0.1200 | 0.1201 | 0.1036 | 0.1977 | 0.1339 | 0.0902 | 0.1085 | 0.1521 | 0.1363 | 0.0858 | 0.0290 | 0.0203 | 0.0116 | 0.0098 | 0.0199 | 0.0033 | 0.0101 | 0.0065 | 0.0115 | 0.0193 | 0.0157 | M |
204 | 0.0323 | 0.0101 | 0.0298 | 0.0564 | 0.0760 | 0.0958 | 0.0990 | 0.1018 | 0.1030 | 0.2154 | 0.3085 | 0.3425 | 0.2990 | 0.1402 | 0.1235 | 0.1534 | 0.1901 | 0.2429 | 0.2120 | 0.2395 | 0.3272 | 0.5949 | 0.8302 | 0.9045 | 0.9888 | 0.9912 | 0.9448 | 1.0000 | 0.9092 | 0.7412 | 0.7691 | 0.7117 | 0.5304 | 0.2131 | 0.0928 | 0.1297 | 0.1159 | 0.1226 | 0.1768 | 0.0345 | 0.1562 | 0.0824 | 0.1149 | 0.1694 | 0.0954 | 0.0080 | 0.0790 | 0.1255 | 0.0647 | 0.0179 | 0.0051 | 0.0061 | 0.0093 | 0.0135 | 0.0063 | 0.0063 | 0.0034 | 0.0032 | 0.0062 | 0.0067 | M |
205 | 0.0522 | 0.0437 | 0.0180 | 0.0292 | 0.0351 | 0.1171 | 0.1257 | 0.1178 | 0.1258 | 0.2529 | 0.2716 | 0.2374 | 0.1878 | 0.0983 | 0.0683 | 0.1503 | 0.1723 | 0.2339 | 0.1962 | 0.1395 | 0.3164 | 0.5888 | 0.7631 | 0.8473 | 0.9424 | 0.9986 | 0.9699 | 1.0000 | 0.8630 | 0.6979 | 0.7717 | 0.7305 | 0.5197 | 0.1786 | 0.1098 | 0.1446 | 0.1066 | 0.1440 | 0.1929 | 0.0325 | 0.1490 | 0.0328 | 0.0537 | 0.1309 | 0.0910 | 0.0757 | 0.1059 | 0.1005 | 0.0535 | 0.0235 | 0.0155 | 0.0160 | 0.0029 | 0.0051 | 0.0062 | 0.0089 | 0.0140 | 0.0138 | 0.0077 | 0.0031 | M |
206 | 0.0303 | 0.0353 | 0.0490 | 0.0608 | 0.0167 | 0.1354 | 0.1465 | 0.1123 | 0.1945 | 0.2354 | 0.2898 | 0.2812 | 0.1578 | 0.0273 | 0.0673 | 0.1444 | 0.2070 | 0.2645 | 0.2828 | 0.4293 | 0.5685 | 0.6990 | 0.7246 | 0.7622 | 0.9242 | 1.0000 | 0.9979 | 0.8297 | 0.7032 | 0.7141 | 0.6893 | 0.4961 | 0.2584 | 0.0969 | 0.0776 | 0.0364 | 0.1572 | 0.1823 | 0.1349 | 0.0849 | 0.0492 | 0.1367 | 0.1552 | 0.1548 | 0.1319 | 0.0985 | 0.1258 | 0.0954 | 0.0489 | 0.0241 | 0.0042 | 0.0086 | 0.0046 | 0.0126 | 0.0036 | 0.0035 | 0.0034 | 0.0079 | 0.0036 | 0.0048 | M |
207 | 0.0260 | 0.0363 | 0.0136 | 0.0272 | 0.0214 | 0.0338 | 0.0655 | 0.1400 | 0.1843 | 0.2354 | 0.2720 | 0.2442 | 0.1665 | 0.0336 | 0.1302 | 0.1708 | 0.2177 | 0.3175 | 0.3714 | 0.4552 | 0.5700 | 0.7397 | 0.8062 | 0.8837 | 0.9432 | 1.0000 | 0.9375 | 0.7603 | 0.7123 | 0.8358 | 0.7622 | 0.4567 | 0.1715 | 0.1549 | 0.1641 | 0.1869 | 0.2655 | 0.1713 | 0.0959 | 0.0768 | 0.0847 | 0.2076 | 0.2505 | 0.1862 | 0.1439 | 0.1470 | 0.0991 | 0.0041 | 0.0154 | 0.0116 | 0.0181 | 0.0146 | 0.0129 | 0.0047 | 0.0039 | 0.0061 | 0.0040 | 0.0036 | 0.0061 | 0.0115 | M |
208 rows × 61 columns
df.shape
(208, 61)
df.describe()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 |
mean | 0.029164 | 0.038437 | 0.043832 | 0.053892 | 0.075202 | 0.104570 | 0.121747 | 0.134799 | 0.178003 | 0.208259 | 0.236013 | 0.250221 | 0.273305 | 0.296568 | 0.320201 | 0.378487 | 0.415983 | 0.452318 | 0.504812 | 0.563047 | 0.609060 | 0.624275 | 0.646975 | 0.672654 | 0.675424 | 0.699866 | 0.702155 | 0.694024 | 0.642074 | 0.580928 | 0.504475 | 0.439040 | 0.417220 | 0.403233 | 0.392571 | 0.384848 | 0.363807 | 0.339657 | 0.325800 | 0.311207 | 0.289252 | 0.278293 | 0.246542 | 0.214075 | 0.197232 | 0.160631 | 0.122453 | 0.091424 | 0.051929 | 0.020424 | 0.016069 | 0.013420 | 0.010709 | 0.010941 | 0.009290 | 0.008222 | 0.007820 | 0.007949 | 0.007941 | 0.006507 |
std | 0.022991 | 0.032960 | 0.038428 | 0.046528 | 0.055552 | 0.059105 | 0.061788 | 0.085152 | 0.118387 | 0.134416 | 0.132705 | 0.140072 | 0.140962 | 0.164474 | 0.205427 | 0.232650 | 0.263677 | 0.261529 | 0.257988 | 0.262653 | 0.257818 | 0.255883 | 0.250175 | 0.239116 | 0.244926 | 0.237228 | 0.245657 | 0.237189 | 0.240250 | 0.220749 | 0.213992 | 0.213237 | 0.206513 | 0.231242 | 0.259132 | 0.264121 | 0.239912 | 0.212973 | 0.199075 | 0.178662 | 0.171111 | 0.168728 | 0.138993 | 0.133291 | 0.151628 | 0.133938 | 0.086953 | 0.062417 | 0.035954 | 0.013665 | 0.012008 | 0.009634 | 0.007060 | 0.007301 | 0.007088 | 0.005736 | 0.005785 | 0.006470 | 0.006181 | 0.005031 |
min | 0.001500 | 0.000600 | 0.001500 | 0.005800 | 0.006700 | 0.010200 | 0.003300 | 0.005500 | 0.007500 | 0.011300 | 0.028900 | 0.023600 | 0.018400 | 0.027300 | 0.003100 | 0.016200 | 0.034900 | 0.037500 | 0.049400 | 0.065600 | 0.051200 | 0.021900 | 0.056300 | 0.023900 | 0.024000 | 0.092100 | 0.048100 | 0.028400 | 0.014400 | 0.061300 | 0.048200 | 0.040400 | 0.047700 | 0.021200 | 0.022300 | 0.008000 | 0.035100 | 0.038300 | 0.037100 | 0.011700 | 0.036000 | 0.005600 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000800 | 0.000500 | 0.001000 | 0.000600 | 0.000400 | 0.000300 | 0.000300 | 0.000100 | 0.000600 |
25% | 0.013350 | 0.016450 | 0.018950 | 0.024375 | 0.038050 | 0.067025 | 0.080900 | 0.080425 | 0.097025 | 0.111275 | 0.129250 | 0.133475 | 0.166125 | 0.175175 | 0.164625 | 0.196300 | 0.205850 | 0.242075 | 0.299075 | 0.350625 | 0.399725 | 0.406925 | 0.450225 | 0.540725 | 0.525800 | 0.544175 | 0.531900 | 0.534775 | 0.463700 | 0.411400 | 0.345550 | 0.281400 | 0.257875 | 0.217575 | 0.179375 | 0.154350 | 0.160100 | 0.174275 | 0.173975 | 0.186450 | 0.163100 | 0.158900 | 0.155200 | 0.126875 | 0.094475 | 0.068550 | 0.064250 | 0.045125 | 0.026350 | 0.011550 | 0.008425 | 0.007275 | 0.005075 | 0.005375 | 0.004150 | 0.004400 | 0.003700 | 0.003600 | 0.003675 | 0.003100 |
50% | 0.022800 | 0.030800 | 0.034300 | 0.044050 | 0.062500 | 0.092150 | 0.106950 | 0.112100 | 0.152250 | 0.182400 | 0.224800 | 0.249050 | 0.263950 | 0.281100 | 0.281700 | 0.304700 | 0.308400 | 0.368300 | 0.434950 | 0.542500 | 0.617700 | 0.664900 | 0.699700 | 0.698500 | 0.721100 | 0.754500 | 0.745600 | 0.731900 | 0.680800 | 0.607150 | 0.490350 | 0.429600 | 0.391200 | 0.351050 | 0.312750 | 0.321150 | 0.306300 | 0.312700 | 0.283500 | 0.278050 | 0.259500 | 0.245100 | 0.222550 | 0.177700 | 0.148000 | 0.121350 | 0.101650 | 0.078100 | 0.044700 | 0.017900 | 0.013900 | 0.011400 | 0.009550 | 0.009300 | 0.007500 | 0.006850 | 0.005950 | 0.005800 | 0.006400 | 0.005300 |
75% | 0.035550 | 0.047950 | 0.057950 | 0.064500 | 0.100275 | 0.134125 | 0.154000 | 0.169600 | 0.233425 | 0.268700 | 0.301650 | 0.331250 | 0.351250 | 0.386175 | 0.452925 | 0.535725 | 0.659425 | 0.679050 | 0.731400 | 0.809325 | 0.816975 | 0.831975 | 0.848575 | 0.872175 | 0.873725 | 0.893800 | 0.917100 | 0.900275 | 0.852125 | 0.735175 | 0.641950 | 0.580300 | 0.556125 | 0.596125 | 0.593350 | 0.556525 | 0.518900 | 0.440550 | 0.434900 | 0.424350 | 0.387525 | 0.384250 | 0.324525 | 0.271750 | 0.231550 | 0.200375 | 0.154425 | 0.120100 | 0.068525 | 0.025275 | 0.020825 | 0.016725 | 0.014900 | 0.014500 | 0.012100 | 0.010575 | 0.010425 | 0.010350 | 0.010325 | 0.008525 |
max | 0.137100 | 0.233900 | 0.305900 | 0.426400 | 0.401000 | 0.382300 | 0.372900 | 0.459000 | 0.682800 | 0.710600 | 0.734200 | 0.706000 | 0.713100 | 0.997000 | 1.000000 | 0.998800 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.965700 | 0.930600 | 1.000000 | 0.964700 | 1.000000 | 1.000000 | 0.949700 | 1.000000 | 0.985700 | 0.929700 | 0.899500 | 0.824600 | 0.773300 | 0.776200 | 0.703400 | 0.729200 | 0.552200 | 0.333900 | 0.198100 | 0.082500 | 0.100400 | 0.070900 | 0.039000 | 0.035200 | 0.044700 | 0.039400 | 0.035500 | 0.044000 | 0.036400 | 0.043900 |
fig = df.hist(xlabelsize=4, ylabelsize=4)
[x.title.set_size(4) for x in fig.ravel()]
pyplot.show()
data = df.values
X, y = data[:, :-1], data[:, -1]
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
model = KNeighborsClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)
print(f'Accuracy: {mean(scores):.3f} {std(scores):.3f}')
Accuracy: 0.797 0.073
Box-Cox Transform
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values[:, :-1]
pt = PowerTransformer(method='box-cox')
data = pt.fit_transform(data)
ValueError: The Box-Cox transformation can only be applied to strictly positive data
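The sonar features include zero values, which is why Box-Cox fails here. A common workaround (a sketch, assuming we first shift the features into a strictly positive range) is to apply a MinMaxScaler with feature_range=(1, 2) before the transform:

# Shift features into [1, 2] so the strictly-positive constraint is satisfied
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values[:, :-1]
shift = MinMaxScaler(feature_range=(1, 2))
pt = PowerTransformer(method='box-cox')
data = pt.fit_transform(shift.fit_transform(data))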
Yeo-Johnson Transform
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values[:, :-1]
pt = PowerTransformer(method='yeo-johnson')
data = pt.fit_transform(data)
df = DataFrame(data)
fig = df.hist(xlabelsize=4, ylabelsize=4)
[x.title.set_size(4) for x in fig.ravel()]
pyplot.show()
Train the model
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values
X, y = data[:, :-1], data[:, -1]
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
power = PowerTransformer(method='yeo-johnson')
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('p', power), ('m', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
print(f'Accuracy: {mean(scores):.3f} {std(scores):.3f}')
Accuracy: 0.808 0.082
Sometimes a lift in performance can be achieved by first standardizing the raw dataset prior to performing a Yeo-Johnson transform. We can explore this by adding a StandardScaler as a first step in the pipeline. The complete example is listed below.
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values
X, y = data[:, :-1], data[:, -1]
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
scaler = StandardScaler()
power = PowerTransformer(method='yeo-johnson')
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('s', scaler), ('p', power), ('m', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
print(f'Accuracy: {mean(scores):.3f}, {std(scores):.3f}')
Accuracy: 0.816, 0.077
Change Numerical Data Distributions
Quantile Transforms
from sklearn.preprocessing import QuantileTransformer
data = randn(1000)
data = exp(data)
pyplot.hist(data, bins=25)
pyplot.show()
data = data.reshape(len(data), 1)
quantile = QuantileTransformer(output_distribution='normal')
data_trans = quantile.fit_transform(data)
pyplot.hist(data_trans, bins=25)
pyplot.show()
Sonar Dataset
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values[:, :-1]
trans = QuantileTransformer(n_quantiles=100, output_distribution='normal')
data = trans.fit_transform(data)
df = DataFrame(data)
fig = df.hist(xlabelsize=4, ylabelsize=4)
[x.title.set_size(4) for x in fig.ravel()]
pyplot.show()
Next, let’s evaluate the same KNN model as the previous section, but in this case on a normal quantile transform of the dataset. The complete example is listed below.
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values
X, y = data[:, :-1], data[:, -1]
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
trans = QuantileTransformer(n_quantiles=100, output_distribution='normal')
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('t', trans), ('m', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
print(f'Accuracy: {mean(scores):.3f} {std(scores):.3f}')
Accuracy: 0.817 0.087
Uniform Quantile Transform
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values[:, :-1]
trans = QuantileTransformer(n_quantiles=100, output_distribution='uniform')
data = trans.fit_transform(data)
df = DataFrame(data)
fig = df.hist(xlabelsize=4, ylabelsize=4)
[x.title.set_size(4) for x in fig.ravel()]
pyplot.show()
Next, let’s evaluate the same KNN model as the previous section, but in this case on a uniform quantile transform of the raw dataset.
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values
X, y = data[:, :-1], data[:, -1]
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
trans = QuantileTransformer(n_quantiles=100, output_distribution='uniform')
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('t', trans), ('m', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
print(f'Accuracy: {mean(scores):.3f} {std(scores):.3f}')
Accuracy: 0.845 0.074
We chose the number of quantiles arbitrarily (100 in this case). This hyperparameter can be tuned to explore the effect of the resolution of the transform on the resulting skill of the model. The example below performs this experiment and plots the mean accuracy for different n_quantiles values from 1 to 99.
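A sketch of that experiment, following the pattern of the earlier tuning loops (the scores it prints were not recorded in the original run):

# Explore n_quantiles from 1 to 99 for the uniform quantile transform
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values
X, y = data[:, :-1].astype('float32'), LabelEncoder().fit_transform(data[:, -1].astype('str'))
results = list()
values = [i for i in range(1, 100)]
for n in values:
    trans = QuantileTransformer(n_quantiles=n, output_distribution='uniform')
    pipeline = Pipeline(steps=[('t', trans), ('m', KNeighborsClassifier())])
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
    results.append(mean(scores))
pyplot.plot(values, results)
pyplot.show()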
Transform Numerical to Categorical Data
from numpy.random import randn
from sklearn.preprocessing import KBinsDiscretizer
from matplotlib import pyplot
data = randn(1000)
pyplot.hist(data, bins=25)
pyplot.show()
data = data.reshape(len(data), 1)
kbins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
data_trans = kbins.fit_transform(data)
print(data_trans[:10, :])
[[5.]
[6.]
[4.]
[5.]
[8.]
[6.]
[3.]
[7.]
[6.]
[4.]]
pyplot.hist(data_trans, bins=10)
pyplot.show()
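The learned bin boundaries are available on the fitted discretizer; for the single input column here:

# Inspect the edges of the 10 equal-width bins
print(kbins.bin_edges_[0])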
Sonar Dataset
from pandas import read_csv
from matplotlib import pyplot
df = read_csv(data_path + 'sonar.csv', header=None)
print(df.shape)
(208, 61)
df.describe()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 | 208.000000 |
mean | 0.029164 | 0.038437 | 0.043832 | 0.053892 | 0.075202 | 0.104570 | 0.121747 | 0.134799 | 0.178003 | 0.208259 | 0.236013 | 0.250221 | 0.273305 | 0.296568 | 0.320201 | 0.378487 | 0.415983 | 0.452318 | 0.504812 | 0.563047 | 0.609060 | 0.624275 | 0.646975 | 0.672654 | 0.675424 | 0.699866 | 0.702155 | 0.694024 | 0.642074 | 0.580928 | 0.504475 | 0.439040 | 0.417220 | 0.403233 | 0.392571 | 0.384848 | 0.363807 | 0.339657 | 0.325800 | 0.311207 | 0.289252 | 0.278293 | 0.246542 | 0.214075 | 0.197232 | 0.160631 | 0.122453 | 0.091424 | 0.051929 | 0.020424 | 0.016069 | 0.013420 | 0.010709 | 0.010941 | 0.009290 | 0.008222 | 0.007820 | 0.007949 | 0.007941 | 0.006507 |
std | 0.022991 | 0.032960 | 0.038428 | 0.046528 | 0.055552 | 0.059105 | 0.061788 | 0.085152 | 0.118387 | 0.134416 | 0.132705 | 0.140072 | 0.140962 | 0.164474 | 0.205427 | 0.232650 | 0.263677 | 0.261529 | 0.257988 | 0.262653 | 0.257818 | 0.255883 | 0.250175 | 0.239116 | 0.244926 | 0.237228 | 0.245657 | 0.237189 | 0.240250 | 0.220749 | 0.213992 | 0.213237 | 0.206513 | 0.231242 | 0.259132 | 0.264121 | 0.239912 | 0.212973 | 0.199075 | 0.178662 | 0.171111 | 0.168728 | 0.138993 | 0.133291 | 0.151628 | 0.133938 | 0.086953 | 0.062417 | 0.035954 | 0.013665 | 0.012008 | 0.009634 | 0.007060 | 0.007301 | 0.007088 | 0.005736 | 0.005785 | 0.006470 | 0.006181 | 0.005031 |
min | 0.001500 | 0.000600 | 0.001500 | 0.005800 | 0.006700 | 0.010200 | 0.003300 | 0.005500 | 0.007500 | 0.011300 | 0.028900 | 0.023600 | 0.018400 | 0.027300 | 0.003100 | 0.016200 | 0.034900 | 0.037500 | 0.049400 | 0.065600 | 0.051200 | 0.021900 | 0.056300 | 0.023900 | 0.024000 | 0.092100 | 0.048100 | 0.028400 | 0.014400 | 0.061300 | 0.048200 | 0.040400 | 0.047700 | 0.021200 | 0.022300 | 0.008000 | 0.035100 | 0.038300 | 0.037100 | 0.011700 | 0.036000 | 0.005600 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000800 | 0.000500 | 0.001000 | 0.000600 | 0.000400 | 0.000300 | 0.000300 | 0.000100 | 0.000600 |
25% | 0.013350 | 0.016450 | 0.018950 | 0.024375 | 0.038050 | 0.067025 | 0.080900 | 0.080425 | 0.097025 | 0.111275 | 0.129250 | 0.133475 | 0.166125 | 0.175175 | 0.164625 | 0.196300 | 0.205850 | 0.242075 | 0.299075 | 0.350625 | 0.399725 | 0.406925 | 0.450225 | 0.540725 | 0.525800 | 0.544175 | 0.531900 | 0.534775 | 0.463700 | 0.411400 | 0.345550 | 0.281400 | 0.257875 | 0.217575 | 0.179375 | 0.154350 | 0.160100 | 0.174275 | 0.173975 | 0.186450 | 0.163100 | 0.158900 | 0.155200 | 0.126875 | 0.094475 | 0.068550 | 0.064250 | 0.045125 | 0.026350 | 0.011550 | 0.008425 | 0.007275 | 0.005075 | 0.005375 | 0.004150 | 0.004400 | 0.003700 | 0.003600 | 0.003675 | 0.003100 |
50% | 0.022800 | 0.030800 | 0.034300 | 0.044050 | 0.062500 | 0.092150 | 0.106950 | 0.112100 | 0.152250 | 0.182400 | 0.224800 | 0.249050 | 0.263950 | 0.281100 | 0.281700 | 0.304700 | 0.308400 | 0.368300 | 0.434950 | 0.542500 | 0.617700 | 0.664900 | 0.699700 | 0.698500 | 0.721100 | 0.754500 | 0.745600 | 0.731900 | 0.680800 | 0.607150 | 0.490350 | 0.429600 | 0.391200 | 0.351050 | 0.312750 | 0.321150 | 0.306300 | 0.312700 | 0.283500 | 0.278050 | 0.259500 | 0.245100 | 0.222550 | 0.177700 | 0.148000 | 0.121350 | 0.101650 | 0.078100 | 0.044700 | 0.017900 | 0.013900 | 0.011400 | 0.009550 | 0.009300 | 0.007500 | 0.006850 | 0.005950 | 0.005800 | 0.006400 | 0.005300 |
75% | 0.035550 | 0.047950 | 0.057950 | 0.064500 | 0.100275 | 0.134125 | 0.154000 | 0.169600 | 0.233425 | 0.268700 | 0.301650 | 0.331250 | 0.351250 | 0.386175 | 0.452925 | 0.535725 | 0.659425 | 0.679050 | 0.731400 | 0.809325 | 0.816975 | 0.831975 | 0.848575 | 0.872175 | 0.873725 | 0.893800 | 0.917100 | 0.900275 | 0.852125 | 0.735175 | 0.641950 | 0.580300 | 0.556125 | 0.596125 | 0.593350 | 0.556525 | 0.518900 | 0.440550 | 0.434900 | 0.424350 | 0.387525 | 0.384250 | 0.324525 | 0.271750 | 0.231550 | 0.200375 | 0.154425 | 0.120100 | 0.068525 | 0.025275 | 0.020825 | 0.016725 | 0.014900 | 0.014500 | 0.012100 | 0.010575 | 0.010425 | 0.010350 | 0.010325 | 0.008525 |
max | 0.137100 | 0.233900 | 0.305900 | 0.426400 | 0.401000 | 0.382300 | 0.372900 | 0.459000 | 0.682800 | 0.710600 | 0.734200 | 0.706000 | 0.713100 | 0.997000 | 1.000000 | 0.998800 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.965700 | 0.930600 | 1.000000 | 0.964700 | 1.000000 | 1.000000 | 0.949700 | 1.000000 | 0.985700 | 0.929700 | 0.899500 | 0.824600 | 0.773300 | 0.776200 | 0.703400 | 0.729200 | 0.552200 | 0.333900 | 0.198100 | 0.082500 | 0.100400 | 0.070900 | 0.039000 | 0.035200 | 0.044700 | 0.039400 | 0.035500 | 0.044000 | 0.036400 | 0.043900 |
fig = df.hist(xlabelsize=4, ylabelsize=4)
[x.title.set_size(4) for x in fig.ravel()]
pyplot.show()
Let’s fit and evaluate a machine learning model on the raw dataset.
from numpy import mean, std
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values
X, y = data[:, :-1], data[:, -1]
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
model = KNeighborsClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)
print(f'Accuracy: {mean(n_scores):.3f}, {std(n_scores):.3f}')
Accuracy: 0.797, 0.073
Uniform Discretization Transform
from pandas import DataFrame
from sklearn.pipeline import Pipeline
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values[:, :-1]
trans = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
data = trans.fit_transform(data)
df = DataFrame(data)
fig = df.hist(xlabelsize=4, ylabelsize=4)
[x.title.set_size(4) for x in fig.ravel()]
pyplot.show()
Next, let’s evaluate the same KNN model as in the previous section, but on the uniform discretization transform of the dataset.
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values
X, y = data[:, :-1], data[:, -1]
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
trans = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('t', trans), ('m', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
print(f'Accuracy: {mean(n_scores):.3f}, {std(n_scores):.3f}')
Accuracy: 0.829, 0.079
k-Means Discretization Transform
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values[:, :-1]
trans = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')
data = trans.fit_transform(data)
df = DataFrame(data)
fig = df.hist(xlabelsize=4, ylabelsize=4)
[x.title.set_size(4) for x in fig.ravel()]
pyplot.show()
Next, let’s evaluate the same KNN model on the k-means discretization transform.
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values
X, y = data[:, :-1], data[:, -1]
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
trans = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('t', trans), ('m', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
print(f'Accuracy: {mean(n_scores):.3f}, {std(n_scores):.3f}')
Accuracy: 0.814, 0.084
Quantile Discretization Transform
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values[:, :-1]
trans = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
data = trans.fit_transform(data)
df = DataFrame(data)
fig = df.hist(xlabelsize=4, ylabelsize=4)
[x.title.set_size(4) for x in fig.ravel()]
pyplot.show()
Next, let’s evaluate the same KNN model on the quantile discretization transform.
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values
X, y = data[:, :-1], data[:, -1]
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
trans = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('t', trans), ('m', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
print(f'Accuracy: {mean(n_scores):.3f} {std(n_scores):.3f}')
Accuracy: 0.840 0.072
We chose the number of bins as an arbitrary number; in this case, 10. This hyperparameter can be tuned to explore the effect of the resolution of the transform on the resulting skill of the model.
def get_dataset(filename):
    df = read_csv(filename, header=None)
    data = df.values
    X, y = data[:, :-1], data[:, -1]
    X = X.astype('float32')
    y = LabelEncoder().fit_transform(y.astype('str'))
    return X, y

def get_models():
    models = dict()
    for i in range(2, 11):
        trans = KBinsDiscretizer(n_bins=i, encode='ordinal', strategy='quantile')
        model = KNeighborsClassifier()
        models[str(i)] = Pipeline(steps=[('t', trans), ('m', model)])
    return models

def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)
    return scores
X, y = get_dataset(data_path + 'sonar.csv')
models = get_models()
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print(f'{name}: {mean(scores):.3f}')
2: 0.822
3: 0.870
4: 0.838
5: 0.838
6: 0.844
7: 0.852
8: 0.838
9: 0.841
10: 0.840
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
Derive New Input Variables
Polynomial Feature Transform
from numpy import asarray
from sklearn.preprocessing import PolynomialFeatures
data = asarray([[2, 3], [2, 3], [2, 3]])
print(data)
[[2 3]
[2 3]
[2 3]]
trans = PolynomialFeatures(degree=2)
data = trans.fit_transform(data)
print(data)
[[1. 2. 3. 4. 6. 9.]
[1. 2. 3. 4. 6. 9.]
[1. 2. 3. 4. 6. 9.]]
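The column order of the expanded array can be checked with get_feature_names_out (available in scikit-learn 1.0+); a short sketch assuming the two inputs are named 'a' and 'b':

# Names of the derived terms: bias, linear, squared, and interaction features
print(trans.get_feature_names_out(['a', 'b']))
# expected: ['1' 'a' 'b' 'a^2' 'a b' 'b^2']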
Polynomial Feature Transform Example
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values[:, :-1]
trans = PolynomialFeatures(degree=3)
data = trans.fit_transform(data)
df = DataFrame(data)
print(df.shape)
(208, 39711)
Next, let’s evaluate the same KNN model on the polynomial feature transform of the dataset.
df = read_csv(data_path + 'sonar.csv', header=None)
data = df.values
X, y = data[:, :-1], data[:, -1]
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))
trans = PolynomialFeatures(degree=3)
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('t', trans), ('m', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
print(f'Accuracy: {mean(n_scores):.3f} {std(n_scores):.3f}')
Accuracy: 0.800 0.077
Effect of Polynomial Degree
X, y = get_dataset(data_path + 'sonar.csv')
num_feature = list()
degrees = [i for i in range(1, 6)]
for d in degrees:
    trans = PolynomialFeatures(degree=d)
    data = trans.fit_transform(X)
    num_feature.append(data.shape[1])
    print(f'Degree: {d}, Features: {data.shape[1]}')
Degree: 1, Features: 61
Degree: 2, Features: 1891
Degree: 3, Features: 39711
Degree: 4, Features: 635376
Degree: 5, Features: 8259888
pyplot.plot(degrees, num_feature)
pyplot.show()
It may be a good idea to treat the degree for the polynomial features transform as a hyperparameter and test different values for your dataset.
def get_models():
    models = dict()
    for d in range(1, 5):
        trans = PolynomialFeatures(degree=d)
        model = KNeighborsClassifier()
        models[str(d)] = Pipeline(steps=[('t', trans), ('m', model)])
    return models

X, y = get_dataset(data_path + 'sonar.csv')
models = get_models()
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print(f'{name}: {mean(scores):.3f} {std(scores):.3f}')
1: 0.797 0.073
2: 0.793 0.085
3: 0.800 0.077
4: 0.795 0.079
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
Advanced Transforms
Transform Both Numerical and Categorical Data
Data Preparation for the Abalone Regression Dataset
from numpy import absolute
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.model_selection import KFold
df = read_csv(data_path + 'abalone.csv', header=None)
last_ix = len(df.columns) - 1
X, y = df.drop(last_ix, axis=1), df[last_ix]
print(X.shape, y.shape)
(4177, 8) (4177,)
numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
t = [('cat', OneHotEncoder(), categorical_ix), ('num', MinMaxScaler(), numerical_ix)]
col_transform = ColumnTransformer(transformers=t)
model = SVR(kernel='rbf', gamma='scale', C=100)
pipeline = Pipeline(steps=[('prep', col_transform), ('m', model)])
cv = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_absolute_error', cv=cv)
scores = absolute(scores)
print(f'MAE: {mean(scores):.3f} {std(scores):.3f}')
MAE: 1.465 0.047
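Instead of computing the column index lists manually, scikit-learn's make_column_selector can pick columns by dtype inside the ColumnTransformer itself; a sketch of an equivalent setup:

# Select columns by dtype rather than by precomputed index lists
from sklearn.compose import make_column_selector
t = [('cat', OneHotEncoder(), make_column_selector(dtype_include=object)),
     ('num', MinMaxScaler(), make_column_selector(dtype_include='number'))]
col_transform = ColumnTransformer(transformers=t)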
Transform the Target in Regression
Example of Using the TransformedTargetRegressor
from numpy import loadtxt
from sklearn.linear_model import HuberRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import RepeatedKFold
data = loadtxt(data_path + 'boston-housing.csv')
X, y = data[:, :-1], data[:, -1]
pipeline = Pipeline(steps=[('normalize', MinMaxScaler()), ('model', HuberRegressor())])
model = TransformedTargetRegressor(regressor=pipeline, transformer=MinMaxScaler())
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv)
scores = absolute(scores)
print(f'Mean: {mean(scores):.3f}')
Mean: 3.203
We are not restricted to using scaling objects; for example, we can also explore using other data transforms on the target variable, such as the PowerTransformer.
from sklearn.preprocessing import PowerTransformer
data = loadtxt(data_path + 'boston-housing.csv')
X, y = data[:, :-1], data[:, -1]
steps = list()
steps.append(('scale', MinMaxScaler(feature_range=(1e-5, 1))))
steps.append(('power', PowerTransformer()))
steps.append(('model', HuberRegressor()))
pipeline = Pipeline(steps=steps)
model = TransformedTargetRegressor(regressor=pipeline, transformer=PowerTransformer())
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv)
scores = absolute(scores)
print(f'Mean: {mean(scores):.3f}')
Mean: 2.972
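TransformedTargetRegressor also accepts plain functions instead of a transformer object, via its func and inverse_func arguments. A sketch using a log transform of the target (not evaluated in the original):

# Train on log1p(y); predictions are inverted with expm1 automatically
from numpy import log1p, expm1
model = TransformedTargetRegressor(regressor=HuberRegressor(), func=log1p, inverse_func=expm1)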
How to Save and Load Data Transforms
Worked Example of Saving Data Preparation
Define a Dataset
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
X, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
for i in range(X_test.shape[1]):
    print(f'{i} > train: min={X_train[:, i].min():.3f}, max={X_train[:, i].max(): .3f}, test: min={X_test[:, i].min():.3f}, max={X_test[:, i].max():.3f}')
0 > train: min=-11.856, max= 0.526, test: min=-11.270, max=0.085
1 > train: min=-6.388, max= 6.507, test: min=-5.581, max=5.926
Scale the Dataset
from sklearn.linear_model import LogisticRegression
from pickle import dump
X, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
scaler = MinMaxScaler()
scaler.fit(X_train)
MinMaxScaler()
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
for i in range(X_test.shape[1]):
    print(f'{i} > train: min={X_train_scaled[:, i].min():.3f}, max={X_train_scaled[:, i].max(): .3f}, test: min={X_test_scaled[:, i].min():.3f}, max={X_test_scaled[:, i].max():.3f}')
0 > train: min=0.000, max= 1.000, test: min=0.047, max=0.964
1 > train: min=0.000, max= 1.000, test: min=0.063, max=0.955
Save Model and Data Scaler
X, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=1)
X_train, _, y_train, _ = train_test_split(X, y, test_size=0.33, random_state=1)
scaler = MinMaxScaler()
scaler.fit(X_train)
MinMaxScaler()
X_train_scaled = scaler.transform(X_train)
model = LogisticRegression(solver='lbfgs')
model.fit(X_train_scaled, y_train)
LogisticRegression()
dump(model, open('model.pkl', 'wb'))
dump(scaler, open('scaler.pkl', 'wb'))
Load Model and Data Scaler
from pickle import load
X, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=1)
_, X_test, _, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
model = load(open('model.pkl', 'rb'))
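The workflow presumably finishes by loading the saved scaler, preparing the test set with it, and evaluating the restored model; a minimal sketch of those steps (the printed accuracy is not from the original run):

# Restore the scaler, transform the held-out data, and score the loaded model
from sklearn.metrics import accuracy_score
scaler = load(open('scaler.pkl', 'rb'))
X_test_scaled = scaler.transform(X_test)
yhat = model.predict(X_test_scaled)
print(f'Test Accuracy: {accuracy_score(y_test, yhat):.3f}')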