from numpy import where
from matplotlib import pyplot
from sklearn.datasets import make_blobs
Intuition for Imbalanced Classification
Foundation
Create and Plot a Binary Classification Problem
# Create a balanced binary classification dataset of 2D blobs and plot it,
# colouring each sample by its class label.
X, y = make_blobs(n_samples=1000, centers=2, n_features=2, cluster_std=3, random_state=1)
# scatter plot of examples, one colour per class
for class_value in range(2):
    # indices of all rows belonging to this class
    row_ix = where(y == class_value)
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
pyplot.show()
Create Synthetic Dataset with a Class Distribution
from numpy import unique, hstack, vstack, where
from matplotlib import pyplot
from sklearn.datasets import make_blobs
def get_dataset(proportions):
    """Create a synthetic 2D blobs dataset with a requested class distribution.

    A balanced base dataset is generated large enough that the biggest
    class can be fully satisfied, then each class is truncated to the
    requested number of examples.

    Parameters
    ----------
    proportions : dict
        Maps class label (int) -> number of examples to keep for that class.
        Each requested count must not exceed the per-class size of the
        generated base dataset (the largest requested count).

    Returns
    -------
    tuple of ndarray
        ``(X, y)`` — feature matrix and label vector containing only the
        requested number of examples per class.
    """
    n_classes = len(proportions)
    # size the base dataset so the largest class is fully available
    largest = max(proportions.values())
    n_samples = largest * n_classes
    # create the balanced base dataset (same seed keeps plots comparable)
    X, y = make_blobs(n_samples=n_samples, centers=n_classes, n_features=2,
                      cluster_std=3, random_state=1)
    # collect the desired number of examples from each class
    X_list, y_list = [], []
    for label, n_keep in proportions.items():
        row_ix = where(y == label)[0]
        selected = row_ix[:n_keep]
        X_list.append(X[selected, :])
        y_list.append(y[selected])
    return vstack(X_list), hstack(y_list)
def plot_dataset(X, y):
    """Scatter plot the dataset, one colour per class label, with a legend.

    Parameters
    ----------
    X : ndarray
        Feature matrix with (at least) two columns; columns 0 and 1 are plotted.
    y : ndarray
        Integer class labels, assumed to be 0..n_classes-1.
    """
    n_classes = len(unique(y))
    for class_value in range(n_classes):
        # indices of all rows belonging to this class
        row_ix = where(y == class_value)[0]
        pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(class_value))
    pyplot.legend()
    pyplot.show()
# define and plot a balanced 1:1 class distribution
proportions = {0: 5000, 1: 5000}
X, y = get_dataset(proportions)
plot_dataset(X, y)
Effect of Skewed Class Distributions
1:10 Imbalanced Class Distribution
# define and plot a 1:10 imbalanced class distribution
proportions = {0: 10000, 1: 1000}
X, y = get_dataset(proportions)
plot_dataset(X, y)
1:100 Imbalanced Class Distribution
# define and plot a 1:100 imbalanced class distribution
proportions = {0: 10000, 1: 100}
X, y = get_dataset(proportions)
plot_dataset(X, y)
1:1000 Imbalanced Class Distribution
# define and plot a 1:1000 imbalanced class distribution
proportions = {0: 10000, 1: 10}
X, y = get_dataset(proportions)
plot_dataset(X, y)
Challenge of Imbalanced Classification
from matplotlib import pyplot
from numpy import where
from collections import Counter
from sklearn.datasets import make_classification
Compounding Effect of Dataset Size
# Show how dataset size compounds the imbalance problem: at a fixed 1:100
# ratio, small datasets may contain almost no minority examples at all.
sizes = [100, 1000, 10000, 100000]
for i, n in enumerate(sizes):
    # create a dataset of this size with a fixed 1:100 class ratio
    X, y = make_classification(n_samples=n, n_features=2, n_redundant=0,
                               n_clusters_per_class=1, weights=[0.99],
                               flip_y=0, random_state=1)
    counter = Counter(y)
    print(f'Size={n}, Ratio={counter}')
    # one subplot per dataset size in a 2x2 grid
    pyplot.subplot(2, 2, 1 + i)
    pyplot.title('n=%d' % n)
    pyplot.xticks([])
    pyplot.yticks([])
    for label in counter:
        row_ix = where(y == label)[0]
        pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
    pyplot.legend()
pyplot.show()
Size=100, Ratio=Counter({0: 99, 1: 1})
Size=1000, Ratio=Counter({0: 990, 1: 10})
Size=10000, Ratio=Counter({0: 9900, 1: 100})
Size=100000, Ratio=Counter({0: 99000, 1: 1000})
Compounding Effect of Label Noise
# Show how label noise compounds the imbalance problem: flipping even a few
# percent of labels noticeably inflates (and corrupts) the minority class.
noise = [0, 0.01, 0.05, 0.07]
for i, n in enumerate(noise):
    # create a 1:100 dataset with this fraction of randomly flipped labels
    X, y = make_classification(n_samples=1000, n_features=2, n_redundant=0,
                               n_clusters_per_class=1, weights=[0.99],
                               flip_y=n, random_state=1)
    counter = Counter(y)
    print(f'Noise= {int(n*100)}, Ratio= {counter}')
    # one subplot per noise level in a 2x2 grid
    pyplot.subplot(2, 2, 1 + i)
    pyplot.title(f'noise= {int(n*100)}')
    pyplot.xticks([])
    pyplot.yticks([])
    for label in counter:
        row_ix = where(y == label)[0]
        pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
    pyplot.legend()
pyplot.show()
Noise= 0, Ratio= Counter({0: 990, 1: 10})
Noise= 1, Ratio= Counter({0: 983, 1: 17})
Noise= 5, Ratio= Counter({0: 963, 1: 37})
Noise= 7, Ratio= Counter({0: 959, 1: 41})
Compounding Effect of Data Distribution
# Show how the data distribution compounds the imbalance problem: the same
# 1:100 ratio is harder to learn when each class spans multiple clusters.
clusters = [1, 2]
for i, c in enumerate(clusters):
    # create a 1:100 dataset with this many clusters per class
    X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
                               n_clusters_per_class=c, weights=[0.99],
                               flip_y=0, random_state=1)
    counter = Counter(y)
    # one subplot per cluster configuration, side by side
    pyplot.subplot(1, 2, 1 + i)
    pyplot.title(f'Clusters= {c}')
    pyplot.xticks([])
    pyplot.yticks([])
    for label in counter:
        row_ix = where(y == label)[0]
        pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
    pyplot.legend()
pyplot.show()