Dimensionality Reduction

from numpy import mean, std
from matplotlib import pyplot
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

LDA Dimensionality Reduction

Worked Example of LDA for Dimensionality Reduction

# define a dataset with 10 classes; LDA can produce at most n_classes - 1 = 9 components
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                           n_redundant=5, random_state=7, n_classes=10)
# reduce the 20 input features to 5 components, then fit a Naive Bayes model
steps = [('lda', LinearDiscriminantAnalysis(n_components=5)), ('m', GaussianNB())]
model = Pipeline(steps=steps)
# evaluate the pipeline with repeated stratified 10-fold cross-validation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)
print(f'Accuracy: {mean(n_scores):.3f} ({std(n_scores):.3f})')
Accuracy: 0.314 (0.049)

How do we know that reducing 20 dimensions of input down to five is good or the best we can do?
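One useful reference point is the same model with no dimensionality reduction at all. The snippet below is a minimal sketch that evaluates GaussianNB on all 20 raw input features, reusing the X, y and cross-validation setup defined above:

# baseline: Gaussian Naive Bayes on the raw 20 features, no LDA transform
baseline = GaussianNB()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(baseline, X, y, scoring='accuracy', cv=cv)
print(f'Baseline Accuracy: {mean(scores):.3f} ({std(scores):.3f})')

With that reference point in hand, the grid below compares LDA pipelines with 1 through 9 components.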

# define the 10-class dataset shared by the examples below
def get_dataset():
    X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                               n_redundant=5, random_state=7, n_classes=10)
    return X, y

# build a pipeline for each candidate number of components;
# with 10 classes, LDA supports at most 9 components
def get_models():
    models = dict()
    for i in range(1, 10):
        steps = [('lda', LinearDiscriminantAnalysis(n_components=i)), ('m', GaussianNB())]
        models[str(i)] = Pipeline(steps=steps)
    return models

# evaluate a pipeline with repeated stratified 10-fold cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)
    return scores
X, y = get_dataset()
models = get_models()
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print(f'{name} > {mean(scores):.3f} ({std(scores):.3f})')
1 > 0.182 (0.032)
2 > 0.235 (0.036)
3 > 0.267 (0.038)
4 > 0.303 (0.037)
5 > 0.314 (0.049)
6 > 0.314 (0.040)
7 > 0.329 (0.042)
8 > 0.343 (0.045)
9 > 0.358 (0.056)
# compare the score distributions across component counts
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
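An alternative to the manual sweep above is to let scikit-learn search over the number of components directly. The sketch below uses GridSearchCV; parameter names address pipeline steps with the <step>__<parameter> convention, so lda__n_components targets the LDA step:

from sklearn.model_selection import GridSearchCV

# tune the number of LDA components (1 through 9 for a 10-class problem)
pipeline = Pipeline(steps=[('lda', LinearDiscriminantAnalysis()), ('m', GaussianNB())])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid = {'lda__n_components': list(range(1, 10))}
search = GridSearchCV(pipeline, grid, scoring='accuracy', cv=cv, n_jobs=-1)
search.fit(X, y)
print(search.best_params_)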

We may choose to use an LDA transform and Naive Bayes model combination as our final model. This involves fitting the Pipeline on all available data and using the pipeline to make predictions on new data.

X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, 
                           n_redundant=5, random_state=7, n_classes=10)
# use 9 components, the best-performing configuration from the comparison above
steps = [('lda', LinearDiscriminantAnalysis(n_components=9)), ('m', GaussianNB())]
model = Pipeline(steps=steps)
model.fit(X, y)
Pipeline(steps=[('lda', LinearDiscriminantAnalysis(n_components=9)),
                ('m', GaussianNB())])
# define a single new row of 20 input values and predict its class
row = [[2.3548775, -1.69674567, 1.6193882, -1.19668862, -2.85422348, -2.00998376,
16.56128782, 2.57257575, 9.93779782, 0.43415008, 6.08274911, 2.12689336, 1.70100279,
3.32160983, 13.02048541, -3.05034488, 2.06346747, -3.33390362, 2.45147541, -1.23455205]]
yhat = model.predict(row)
print(f'Predicted Class: {yhat[0]}')
Predicted Class: 6
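If you want to inspect the reduced representation itself, the LDA step of the fitted pipeline can be applied on its own. A minimal sketch, reusing the fitted model and X from above:

# apply only the LDA step to recover the 9-component representation
X_reduced = model.named_steps['lda'].transform(X)
print(X_reduced.shape)  # (1000, 9)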

PCA Dimensionality Reduction

Worked Example of PCA for Dimensionality Reduction

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                          n_redundant=5, random_state=7)
# reduce the 20 input features to 10 principal components, then fit logistic regression
steps = [('pca', PCA(n_components=10)), ('m', LogisticRegression())]
model = Pipeline(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)
print(f'Accuracy: {mean(n_scores):.3f} ({std(n_scores):.3f})')
Accuracy: 0.816 (0.034)

How do we know that reducing 20 dimensions of input down to 10 is good or the best we can do? As before, we can compare pipelines across a range of component counts. Note that the comparison below reuses the 10-class dataset from get_dataset(), so its absolute scores are lower than the two-class example above.
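Another way to choose the number of components, specific to PCA, is to inspect how much of the variance each component explains. A minimal diagnostic sketch, reusing the X defined above:

from numpy import cumsum

# fit PCA with all 20 components and report cumulative explained variance
pca = PCA().fit(X)
print(cumsum(pca.explained_variance_ratio_))

A common heuristic is to keep enough components to cover most of the variance (e.g. 95 percent), although the cross-validated comparison below is the more direct test of predictive skill.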

# build a pipeline for each candidate number of principal components (1 to 20)
def get_models():
    models = dict()
    for i in range(1, 21):
        steps = [('pca', PCA(n_components=i)), ('m', LogisticRegression())]
        models[str(i)] = Pipeline(steps=steps)
    return models
X, y = get_dataset()
models = get_models()
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print(f'{name} > {mean(scores):.3f}')
1 > 0.140
2 > 0.147
3 > 0.152
4 > 0.176
5 > 0.171
6 > 0.205
7 > 0.240
8 > 0.263
9 > 0.274
10 > 0.285
11 > 0.287
12 > 0.305
13 > 0.311
14 > 0.306
15 > 0.323
16 > 0.323
17 > 0.323
18 > 0.323
19 > 0.323
20 > 0.323
# compare the score distributions across component counts
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.xticks(rotation=45)
pyplot.show()

The scores improve steadily up to about 15 components, matching the 15 informative features in the dataset, and then level off. The example below demonstrates fitting a final model with a PCA transform and using it to make a prediction on new data.

X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                          n_redundant=5, random_state=7)
# use 15 components, where performance leveled off in the comparison above
steps = [('pca', PCA(n_components=15)), ('m', LogisticRegression())]
model = Pipeline(steps=steps)
model.fit(X, y)
Pipeline(steps=[('pca', PCA(n_components=15)), ('m', LogisticRegression())])
# define a single new row of 20 input values and predict its class
row = [[0.2929949, -4.21223056, -1.288332, -2.17849815, -0.64527665, 2.58097719,
0.28422388, -7.1827928, -1.91211104, 2.73729512, 0.81395695, 3.96973717, -2.66939799,
3.34692332, 4.19791821, 0.99990998, -0.30201875, -4.43170633, -2.82646737, 0.44916808]]
yhat = model.predict(row)
print(f'Predicted Class: {yhat[0]}')
Predicted Class: 1

SVD Dimensionality Reduction

Worked Example of SVD for Dimensionality Reduction

from sklearn.decomposition import TruncatedSVD
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                          n_redundant=5, random_state=7)
# reduce the 20 input features to 10 components with truncated SVD, then fit logistic regression
steps = [('svd', TruncatedSVD(n_components=10)), ('m', LogisticRegression())]
model = Pipeline(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)
print(f'Accuracy: {mean(n_scores):.3f} ({std(n_scores):.3f})')
Accuracy: 0.814 (0.034)

How do we know that reducing 20 dimensions of input down to 10 is good or the best we can do? We can repeat the grid comparison, this time sweeping from 1 to 19 components, since TruncatedSVD requires strictly fewer components than input features.

# build a pipeline for each candidate number of components;
# TruncatedSVD requires n_components strictly less than the number of features
def get_models():
    models = dict()
    for i in range(1, 20):
        steps = [('svd', TruncatedSVD(n_components=i)), ('m', LogisticRegression())]
        models[str(i)] = Pipeline(steps=steps)
    return models
X, y = get_dataset()
models = get_models()
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print(f'{name} > {mean(scores):.3f} ({std(scores):.3f})')
1 > 0.140 (0.024)
2 > 0.147 (0.021)
3 > 0.152 (0.023)
4 > 0.177 (0.032)
5 > 0.171 (0.036)
6 > 0.204 (0.038)
7 > 0.236 (0.037)
8 > 0.265 (0.035)
9 > 0.279 (0.036)
10 > 0.288 (0.035)
11 > 0.289 (0.034)
12 > 0.306 (0.037)
13 > 0.309 (0.037)
14 > 0.308 (0.033)
15 > 0.323 (0.039)
16 > 0.323 (0.039)
17 > 0.323 (0.039)
18 > 0.323 (0.039)
19 > 0.323 (0.039)
# compare the score distributions across component counts
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.xticks(rotation=45)
pyplot.show()
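One practical note: unlike PCA, TruncatedSVD does not center the data before decomposing it, so it can be applied directly to scipy sparse matrices without converting them to dense arrays. A minimal sketch, converting the dataset above to sparse form purely for illustration:

from scipy.sparse import csr_matrix

# TruncatedSVD accepts sparse input directly; PCA would require a dense array
X_sparse = csr_matrix(X)
svd = TruncatedSVD(n_components=15, random_state=7)
X_reduced = svd.fit_transform(X_sparse)
print(X_reduced.shape)  # (1000, 15)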

We may choose to use an SVD transform and logistic regression model combination as our final model. As before, this involves fitting the pipeline on all available data and using it to make predictions on new data.

X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                          n_redundant=5, random_state=7)
# use 15 components, where performance leveled off in the comparison above
steps = [('svd', TruncatedSVD(n_components=15)), ('m', LogisticRegression())]
model = Pipeline(steps=steps)
model.fit(X, y)
Pipeline(steps=[('svd', TruncatedSVD(n_components=15)),
                ('m', LogisticRegression())])
# define a single new row of 20 input values and predict its class
row = [[0.2929949, -4.21223056, -1.288332, -2.17849815, -0.64527665, 2.58097719,
0.28422388, -7.1827928, -1.91211104, 2.73729512, 0.81395695, 3.96973717, -2.66939799,
3.34692332, 4.19791821, 0.99990998, -0.30201875, -4.43170633, -2.82646737, 0.44916808]]
yhat = model.predict(row)
print(f'Predicted Class: {yhat[0]}')
Predicted Class: 1