Data Cleaning

data_path = '/home/naji/Desktop/github-repos/machine-learning/nbs/0-datasets/data/'
from numpy import loadtxt
from numpy import unique
from pandas import read_csv

Basic Data Cleaning

Identify Columns That Contain a Single Value

data_path = '/home/naji/Desktop/github-repos/machine-learning/nbs/0-datasets/data/'
data = loadtxt(data_path + 'oil-spill.csv', delimiter=',')

Method 1

data
array([[1.00000e+00, 2.55800e+03, 1.50609e+03, ..., 6.57400e+01,
        7.95000e+00, 1.00000e+00],
       [2.00000e+00, 2.23250e+04, 7.91100e+01, ..., 6.57300e+01,
        6.26000e+00, 0.00000e+00],
       [3.00000e+00, 1.15000e+02, 1.44985e+03, ..., 6.58100e+01,
        7.84000e+00, 1.00000e+00],
       ...,
       [2.02000e+02, 1.40000e+01, 2.51400e+01, ..., 6.59100e+01,
        6.12000e+00, 0.00000e+00],
       [2.03000e+02, 1.00000e+01, 9.60000e+01, ..., 6.59700e+01,
        6.32000e+00, 0.00000e+00],
       [2.04000e+02, 1.10000e+01, 7.73000e+00, ..., 6.56500e+01,
        6.26000e+00, 0.00000e+00]])
data.shape
(937, 50)
for i in range(data.shape[1]):
    print(i, len(unique(data[:,i])))
0 238
1 297
2 927
3 933
4 179
5 375
6 820
7 618
8 561
9 57
10 577
11 59
12 73
13 107
14 53
15 91
16 893
17 810
18 170
19 53
20 68
21 9
22 1
23 92
24 9
25 8
26 9
27 308
28 447
29 392
30 107
31 42
32 4
33 45
34 141
35 110
36 3
37 758
38 9
39 9
40 388
41 220
42 644
43 649
44 499
45 2
46 937
47 169
48 286
49 2

Method 2

df = read_csv(data_path + 'oil-spill.csv', header=None)
df
0 1 2 3 4 5 6 7 8 9 ... 40 41 42 43 44 45 46 47 48 49
0 1 2558 1506.09 456.63 90 6395000.0 40.88 7.89 29780.0 0.19 ... 2850.00 1000.00 763.16 135.46 3.73 0 33243.19 65.74 7.95 1
1 2 22325 79.11 841.03 180 55812500.0 51.11 1.21 61900.0 0.02 ... 5750.00 11500.00 9593.48 1648.80 0.60 0 51572.04 65.73 6.26 0
2 3 115 1449.85 608.43 88 287500.0 40.42 7.34 3340.0 0.18 ... 1400.00 250.00 150.00 45.13 9.33 1 31692.84 65.81 7.84 1
3 4 1201 1562.53 295.65 66 3002500.0 42.40 7.97 18030.0 0.19 ... 6041.52 761.58 453.21 144.97 13.33 1 37696.21 65.67 8.07 1
4 5 312 950.27 440.86 37 780000.0 41.43 7.03 3350.0 0.17 ... 1320.04 710.63 512.54 109.16 2.58 0 29038.17 65.66 7.35 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
932 200 12 92.42 364.42 135 97200.0 59.42 10.34 884.0 0.17 ... 381.84 254.56 84.85 146.97 4.50 0 2593.50 65.85 6.39 0
933 201 11 98.82 248.64 159 89100.0 59.64 10.18 831.0 0.17 ... 284.60 180.00 150.00 51.96 1.90 0 4361.25 65.70 6.53 0
934 202 14 25.14 428.86 24 113400.0 60.14 17.94 847.0 0.30 ... 402.49 180.00 180.00 0.00 2.24 0 2153.05 65.91 6.12 0
935 203 10 96.00 451.30 68 81000.0 59.90 15.01 831.0 0.25 ... 402.49 180.00 90.00 73.48 4.47 0 2421.43 65.97 6.32 0
936 204 11 7.73 235.73 135 89100.0 61.82 12.24 831.0 0.20 ... 254.56 254.56 127.28 180.00 2.00 0 3782.68 65.65 6.26 0

937 rows × 50 columns

print(df.nunique())
0     238
1     297
2     927
3     933
4     179
5     375
6     820
7     618
8     561
9      57
10    577
11     59
12     73
13    107
14     53
15     91
16    893
17    810
18    170
19     53
20     68
21      9
22      1
23     92
24      9
25      8
26      9
27    308
28    447
29    392
30    107
31     42
32      4
33     45
34    141
35    110
36      3
37    758
38      9
39      9
40    388
41    220
42    644
43    649
44    499
45      2
46    937
47    169
48    286
49      2
dtype: int64

Delete Columns That Contain a Single Value

df = read_csv(data_path + 'oil-spill.csv', header=None)
df.shape
(937, 50)
counts = df.nunique()
counts
0     238
1     297
2     927
3     933
4     179
5     375
6     820
7     618
8     561
9      57
10    577
11     59
12     73
13    107
14     53
15     91
16    893
17    810
18    170
19     53
20     68
21      9
22      1
23     92
24      9
25      8
26      9
27    308
28    447
29    392
30    107
31     42
32      4
33     45
34    141
35    110
36      3
37    758
38      9
39      9
40    388
41    220
42    644
43    649
44    499
45      2
46    937
47    169
48    286
49      2
dtype: int64
to_del = [i for i, v in enumerate(counts) if v==1]
print(to_del)
[22]
df.drop(to_del, axis=1, inplace=True)
print(df.shape)
(937, 49)

Consider Columns That Have Very Few Values

data = loadtxt(data_path + 'oil-spill.csv', delimiter=',')
data
array([[1.00000e+00, 2.55800e+03, 1.50609e+03, ..., 6.57400e+01,
        7.95000e+00, 1.00000e+00],
       [2.00000e+00, 2.23250e+04, 7.91100e+01, ..., 6.57300e+01,
        6.26000e+00, 0.00000e+00],
       [3.00000e+00, 1.15000e+02, 1.44985e+03, ..., 6.58100e+01,
        7.84000e+00, 1.00000e+00],
       ...,
       [2.02000e+02, 1.40000e+01, 2.51400e+01, ..., 6.59100e+01,
        6.12000e+00, 0.00000e+00],
       [2.03000e+02, 1.00000e+01, 9.60000e+01, ..., 6.59700e+01,
        6.32000e+00, 0.00000e+00],
       [2.04000e+02, 1.10000e+01, 7.73000e+00, ..., 6.56500e+01,
        6.26000e+00, 0.00000e+00]])
for i in range(data.shape[1]):
    num = len(unique(data[:, i]))
    percentage = float(num) / data.shape[0] * 100
    print(f'{i}, {num}, {percentage: 0.1f}%')
0, 238,  25.4%
1, 297,  31.7%
2, 927,  98.9%
3, 933,  99.6%
4, 179,  19.1%
5, 375,  40.0%
6, 820,  87.5%
7, 618,  66.0%
8, 561,  59.9%
9, 57,  6.1%
10, 577,  61.6%
11, 59,  6.3%
12, 73,  7.8%
13, 107,  11.4%
14, 53,  5.7%
15, 91,  9.7%
16, 893,  95.3%
17, 810,  86.4%
18, 170,  18.1%
19, 53,  5.7%
20, 68,  7.3%
21, 9,  1.0%
22, 1,  0.1%
23, 92,  9.8%
24, 9,  1.0%
25, 8,  0.9%
26, 9,  1.0%
27, 308,  32.9%
28, 447,  47.7%
29, 392,  41.8%
30, 107,  11.4%
31, 42,  4.5%
32, 4,  0.4%
33, 45,  4.8%
34, 141,  15.0%
35, 110,  11.7%
36, 3,  0.3%
37, 758,  80.9%
38, 9,  1.0%
39, 9,  1.0%
40, 388,  41.4%
41, 220,  23.5%
42, 644,  68.7%
43, 649,  69.3%
44, 499,  53.3%
45, 2,  0.2%
46, 937,  100.0%
47, 169,  18.0%
48, 286,  30.5%
49, 2,  0.2%
for i in range(data.shape[1]):
    num = len(unique(data[:, i]))
    percentage = float(num) / data.shape[0] * 100
    if percentage < 1:
        print(f'{i}, {num}, {percentage: .1f}%')
21, 9,  1.0%
22, 1,  0.1%
24, 9,  1.0%
25, 8,  0.9%
26, 9,  1.0%
32, 4,  0.4%
36, 3,  0.3%
38, 9,  1.0%
39, 9,  1.0%
45, 2,  0.2%
49, 2,  0.2%
counts = df.nunique()
to_del = [c for c, v in counts.items() if (float(v) / df.shape[0] * 100) < 1]
to_del
[21, 24, 25, 26, 32, 36, 38, 39, 45, 49]
df.drop(to_del, axis=1, inplace=True)
df.shape
(937, 39)

Remove Columns That Have a Low Variance

from numpy import arange
from pandas import read_csv
from sklearn.feature_selection import VarianceThreshold
from matplotlib import pyplot
df = read_csv(data_path + 'oil-spill.csv', header=None)
df
0 1 2 3 4 5 6 7 8 9 ... 40 41 42 43 44 45 46 47 48 49
0 1 2558 1506.09 456.63 90 6395000.0 40.88 7.89 29780.0 0.19 ... 2850.00 1000.00 763.16 135.46 3.73 0 33243.19 65.74 7.95 1
1 2 22325 79.11 841.03 180 55812500.0 51.11 1.21 61900.0 0.02 ... 5750.00 11500.00 9593.48 1648.80 0.60 0 51572.04 65.73 6.26 0
2 3 115 1449.85 608.43 88 287500.0 40.42 7.34 3340.0 0.18 ... 1400.00 250.00 150.00 45.13 9.33 1 31692.84 65.81 7.84 1
3 4 1201 1562.53 295.65 66 3002500.0 42.40 7.97 18030.0 0.19 ... 6041.52 761.58 453.21 144.97 13.33 1 37696.21 65.67 8.07 1
4 5 312 950.27 440.86 37 780000.0 41.43 7.03 3350.0 0.17 ... 1320.04 710.63 512.54 109.16 2.58 0 29038.17 65.66 7.35 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
932 200 12 92.42 364.42 135 97200.0 59.42 10.34 884.0 0.17 ... 381.84 254.56 84.85 146.97 4.50 0 2593.50 65.85 6.39 0
933 201 11 98.82 248.64 159 89100.0 59.64 10.18 831.0 0.17 ... 284.60 180.00 150.00 51.96 1.90 0 4361.25 65.70 6.53 0
934 202 14 25.14 428.86 24 113400.0 60.14 17.94 847.0 0.30 ... 402.49 180.00 180.00 0.00 2.24 0 2153.05 65.91 6.12 0
935 203 10 96.00 451.30 68 81000.0 59.90 15.01 831.0 0.25 ... 402.49 180.00 90.00 73.48 4.47 0 2421.43 65.97 6.32 0
936 204 11 7.73 235.73 135 89100.0 61.82 12.24 831.0 0.20 ... 254.56 254.56 127.28 180.00 2.00 0 3782.68 65.65 6.26 0

937 rows × 50 columns

data = df.values
X = data[:, :-1]
y = data[:, -1]
print(X.shape, y.shape)
(937, 49) (937,)
thresholds = arange(0, 0.55, 0.05)
results = list()

for t in thresholds:
    transform = VarianceThreshold(threshold=t)
    X_sel = transform.fit_transform(X)
    n_features = X_sel.shape[1]
    print(f'> Threshold={t: .2f}, Features={n_features}')
    results.append(n_features)
> Threshold= 0.00, Features=48
> Threshold= 0.05, Features=37
> Threshold= 0.10, Features=36
> Threshold= 0.15, Features=35
> Threshold= 0.20, Features=35
> Threshold= 0.25, Features=35
> Threshold= 0.30, Features=35
> Threshold= 0.35, Features=35
> Threshold= 0.40, Features=35
> Threshold= 0.45, Features=33
> Threshold= 0.50, Features=31
pyplot.plot(thresholds, results)
pyplot.show()
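
To see which columns survive a chosen cut-off, the fitted transform exposes get_support(); a small sketch, assuming a threshold of 0.05 picked from the sweep above:

transform = VarianceThreshold(threshold=0.05)
X_sel = transform.fit_transform(X)
kept = transform.get_support(indices=True)  # column indices of X that were retained
print(X_sel.shape)
print(kept)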

Identify Rows That Contain Duplicate Data

df = read_csv(data_path + 'iris.csv')
df
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 Iris-virginica
146 6.3 2.5 5.0 1.9 Iris-virginica
147 6.5 3.0 5.2 2.0 Iris-virginica
148 6.2 3.4 5.4 2.3 Iris-virginica
149 5.9 3.0 5.1 1.8 Iris-virginica

150 rows × 5 columns

dups = df.duplicated()
dups
0      False
1      False
2      False
3      False
4      False
       ...  
145    False
146    False
147    False
148    False
149    False
Length: 150, dtype: bool
print(dups.any())
True
print(df[dups])
     sepal_length  sepal_width  petal_length  petal_width         species
34            4.9          3.1           1.5          0.1     Iris-setosa
37            4.9          3.1           1.5          0.1     Iris-setosa
142           5.8          2.7           5.1          1.9  Iris-virginica

Delete Rows That Contain Duplicate Data

df.shape
(150, 5)
df.drop_duplicates(inplace=True)
df.shape
(147, 5)
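
By default the first occurrence of each duplicate row is kept; drop_duplicates() also accepts keep='last' or keep=False (drop every copy) and a subset of columns to compare on. A small illustrative sketch:

df = read_csv(data_path + 'iris.csv')
print(df.drop_duplicates(keep='last').shape)         # keep the last copy instead of the first
print(df.drop_duplicates(subset=['species']).shape)  # compare on selected columns only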

Outlier Identification and Removal

from numpy.random import randn, seed
from numpy import mean, std

Test Dataset

seed(1)
data = 5 * randn(10000) + 50
print(f'{mean(data): .3f}, ({std(data):.3f})')
 50.049, (4.994)

Standard Deviation Method

data = 5 * randn(10000) + 50
data
array([49.38763047, 51.14084909, 48.23847435, ..., 62.04216899,
       54.41392775, 49.50201845])
data_mean, data_std = mean(data), std(data)
cut_off = data_std * 3
lower, upper = data_mean - cut_off, data_mean + cut_off
outliers = [x for x in data if x < lower or x > upper]
print(f'Identified outliers: {len(outliers)}')
Identified outliers: 26
outliers_removed = [x for x in data if x >= lower and x <= upper]
print(f'Non-outlier observations: {len(outliers_removed)}')
Non-outlier observations: 9974
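
The same selection can be written as a vectorized boolean mask instead of list comprehensions; a minimal sketch reusing the lower and upper bounds computed above:

mask = (data >= lower) & (data <= upper)  # True for non-outlier observations
print(f'Identified outliers: {int((~mask).sum())}')
print(f'Non-outlier observations: {int(mask.sum())}')
data_clean = data[mask]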

Interquartile Range Method

from numpy import percentile
data = 5 * randn(10000) + 50
q25, q75 = percentile(data, 25), percentile(data, 75)
iqr = q75 - q25
print(f'Percentiles: 25th={q25:.3f}, 75th={q75:.3f}, IQR={iqr:.3f}')
Percentiles: 25th=46.567, 75th=53.215, IQR=6.647
cut_off = iqr * 1.5
lower, upper = q25 - cut_off, q75 + cut_off
outliers = [x for x in data if x < lower or x > upper]
print(f'Identified outliers: {len(outliers)}')
Identified outliers: 75
outliers_removed = [x for x in data if x >= lower and x <= upper]
print(f'Non-outlier observations: {len(outliers_removed)}')
Non-outlier observations: 9925

Automatic Outlier Detection

from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

Predicting without removing outliers

df = read_csv(data_path + 'boston-housing.csv', header=None, delim_whitespace=True)
df
0 1 2 3 4 5 6 7 8 9 10 11 12 13
0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222.0 18.7 396.90 5.33 36.2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
501 0.06263 0.0 11.93 0 0.573 6.593 69.1 2.4786 1 273.0 21.0 391.99 9.67 22.4
502 0.04527 0.0 11.93 0 0.573 6.120 76.7 2.2875 1 273.0 21.0 396.90 9.08 20.6
503 0.06076 0.0 11.93 0 0.573 6.976 91.0 2.1675 1 273.0 21.0 396.90 5.64 23.9
504 0.10959 0.0 11.93 0 0.573 6.794 89.3 2.3889 1 273.0 21.0 393.45 6.48 22.0
505 0.04741 0.0 11.93 0 0.573 6.030 80.8 2.5050 1 273.0 21.0 396.90 7.88 11.9

506 rows × 14 columns

data = df.values
X, y = data[:, :-1], data[:, -1]
X.shape, y.shape
((506, 13), (506,))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()
yhat = model.predict(X_test)
mae = mean_absolute_error(y_test, yhat)
print(f'{mae:0.3f}')
3.417

Predicting after removing outliers

from sklearn.neighbors import LocalOutlierFactor
df = read_csv(data_path + 'boston-housing.csv', delim_whitespace=True, header=None)
df
0 1 2 3 4 5 6 7 8 9 10 11 12 13
0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222.0 18.7 396.90 5.33 36.2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
501 0.06263 0.0 11.93 0 0.573 6.593 69.1 2.4786 1 273.0 21.0 391.99 9.67 22.4
502 0.04527 0.0 11.93 0 0.573 6.120 76.7 2.2875 1 273.0 21.0 396.90 9.08 20.6
503 0.06076 0.0 11.93 0 0.573 6.976 91.0 2.1675 1 273.0 21.0 396.90 5.64 23.9
504 0.10959 0.0 11.93 0 0.573 6.794 89.3 2.3889 1 273.0 21.0 393.45 6.48 22.0
505 0.04741 0.0 11.93 0 0.573 6.030 80.8 2.5050 1 273.0 21.0 396.90 7.88 11.9

506 rows × 14 columns

data = df.values
X, y = data[:, :-1], data[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
print(X_train.shape, y_train.shape)
(339, 13) (339,)
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
print(X_train.shape, y_train.shape)
(305, 13) (305,)
model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()
yhat = model.predict(X_test)
mae = mean_absolute_error(y_test, yhat)
print(f'MAE: {mae:.3f}')
MAE: 3.356
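
LocalOutlierFactor is not the only option: other scikit-learn detectors such as IsolationForest follow the same fit_predict() convention (-1 marks an outlier) and can be swapped in directly. A sketch, with the contamination value chosen purely for illustration:

from sklearn.ensemble import IsolationForest
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
iso = IsolationForest(contamination=0.1, random_state=1)  # contamination is an assumed value
yhat = iso.fit_predict(X_train)
mask = yhat != -1  # keep only the rows flagged as inliers
X_train, y_train = X_train[mask, :], y_train[mask]
model = LinearRegression()
model.fit(X_train, y_train)
mae = mean_absolute_error(y_test, model.predict(X_test))
print(f'MAE: {mae:.3f}')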

Remove Missing Data

from numpy import nan
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold, cross_val_score

Mark Missing Values

dataset = read_csv(data_path + 'pima-indians-diabetes.csv', header=None)
dataset.head(20)
0 1 2 3 4 5 6 7 8
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
5 5 116 74 0 0 25.6 0.201 30 0
6 3 78 50 32 88 31.0 0.248 26 1
7 10 115 0 0 0 35.3 0.134 29 0
8 2 197 70 45 543 30.5 0.158 53 1
9 8 125 96 0 0 0.0 0.232 54 1
10 4 110 92 0 0 37.6 0.191 30 0
11 10 168 74 0 0 38.0 0.537 34 1
12 10 139 80 0 0 27.1 1.441 57 0
13 1 189 60 23 846 30.1 0.398 59 1
14 5 166 72 19 175 25.8 0.587 51 1
15 7 100 0 0 0 30.0 0.484 32 1
16 0 118 84 47 230 45.8 0.551 31 1
17 7 107 74 0 0 29.6 0.254 31 1
18 1 103 30 38 83 43.3 0.183 33 0
19 1 115 70 30 96 34.6 0.529 32 1
dataset.describe()
0 1 2 3 4 5 6 7 8
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 0.471876 33.240885 0.348958
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 0.331329 11.760232 0.476951
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 29.000000 0.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000
num_missing = (dataset[[1,2,3,4,5]] == 0).sum()
print(num_missing)
1      5
2     35
3    227
4    374
5     11
dtype: int64
dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, nan)
print(dataset.isnull().sum())
0      0
1      5
2     35
3    227
4    374
5     11
6      0
7      0
8      0
dtype: int64
print(dataset.head(20))
     0      1     2     3      4     5      6   7  8
0    6  148.0  72.0  35.0    NaN  33.6  0.627  50  1
1    1   85.0  66.0  29.0    NaN  26.6  0.351  31  0
2    8  183.0  64.0   NaN    NaN  23.3  0.672  32  1
3    1   89.0  66.0  23.0   94.0  28.1  0.167  21  0
4    0  137.0  40.0  35.0  168.0  43.1  2.288  33  1
5    5  116.0  74.0   NaN    NaN  25.6  0.201  30  0
6    3   78.0  50.0  32.0   88.0  31.0  0.248  26  1
7   10  115.0   NaN   NaN    NaN  35.3  0.134  29  0
8    2  197.0  70.0  45.0  543.0  30.5  0.158  53  1
9    8  125.0  96.0   NaN    NaN   NaN  0.232  54  1
10   4  110.0  92.0   NaN    NaN  37.6  0.191  30  0
11  10  168.0  74.0   NaN    NaN  38.0  0.537  34  1
12  10  139.0  80.0   NaN    NaN  27.1  1.441  57  0
13   1  189.0  60.0  23.0  846.0  30.1  0.398  59  1
14   5  166.0  72.0  19.0  175.0  25.8  0.587  51  1
15   7  100.0   NaN   NaN    NaN  30.0  0.484  32  1
16   0  118.0  84.0  47.0  230.0  45.8  0.551  31  1
17   7  107.0  74.0   NaN    NaN  29.6  0.254  31  1
18   1  103.0  30.0  38.0   83.0  43.3  0.183  33  0
19   1  115.0  70.0  30.0   96.0  34.6  0.529  32  1

Missing Values Cause Problems

dataset = read_csv(data_path + 'pima-indians-diabetes.csv', header=None)
dataset.head(20)
0 1 2 3 4 5 6 7 8
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
5 5 116 74 0 0 25.6 0.201 30 0
6 3 78 50 32 88 31.0 0.248 26 1
7 10 115 0 0 0 35.3 0.134 29 0
8 2 197 70 45 543 30.5 0.158 53 1
9 8 125 96 0 0 0.0 0.232 54 1
10 4 110 92 0 0 37.6 0.191 30 0
11 10 168 74 0 0 38.0 0.537 34 1
12 10 139 80 0 0 27.1 1.441 57 0
13 1 189 60 23 846 30.1 0.398 59 1
14 5 166 72 19 175 25.8 0.587 51 1
15 7 100 0 0 0 30.0 0.484 32 1
16 0 118 84 47 230 45.8 0.551 31 1
17 7 107 74 0 0 29.6 0.254 31 1
18 1 103 30 38 83 43.3 0.183 33 0
19 1 115 70 30 96 34.6 0.529 32 1
dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, nan)
values = dataset.values
X = values[:, 0:8]
y = values[:, 8]
model = LinearDiscriminantAnalysis()
cv = KFold(n_splits=3, shuffle=True, random_state=1)
result = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
print(f'{result.mean():0.3f}')
ValueError: 
All the 3 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/home/naji/miniconda2/envs/nbdev/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/naji/miniconda2/envs/nbdev/lib/python3.9/site-packages/sklearn/discriminant_analysis.py", line 550, in fit
    X, y = self._validate_data(
  File "/home/naji/miniconda2/envs/nbdev/lib/python3.9/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/naji/miniconda2/envs/nbdev/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1074, in check_X_y
    X = check_array(
  File "/home/naji/miniconda2/envs/nbdev/lib/python3.9/site-packages/sklearn/utils/validation.py", line 899, in check_array
    _assert_all_finite(
  File "/home/naji/miniconda2/envs/nbdev/lib/python3.9/site-packages/sklearn/utils/validation.py", line 146, in _assert_all_finite
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
LinearDiscriminantAnalysis does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

Remove Rows With Missing Values

dataset = read_csv(data_path + 'pima-indians-diabetes.csv', header=None)
dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, nan)
dataset.dropna(inplace=True)
values = dataset.values
X = values[:, 0:8]
y = values[:,8]
model = LinearDiscriminantAnalysis()
cv = KFold(n_splits=3, shuffle=True, random_state=1)
result = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
print(f'Accuracy:{result.mean():0.3f}')
Accuracy:0.781
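
It is worth checking how many rows dropna() discards before committing to this approach; a small sketch:

dataset = read_csv(data_path + 'pima-indians-diabetes.csv', header=None)
dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, nan)
print(dataset.shape)           # rows before removing missing values
print(dataset.dropna().shape)  # rows after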

Imputation

Horse Colic Dataset

dataframe = read_csv(data_path + 'horse-colic.csv', header=None, na_values='?')
dataframe.head()
0 1 2 3 4 5 6 7 8 9 ... 18 19 20 21 22 23 24 25 26 27
0 2.0 1 530101 38.5 66.0 28.0 3.0 3.0 NaN 2.0 ... 45.0 8.4 NaN NaN 2.0 2 11300 0 0 2
1 1.0 1 534817 39.2 88.0 20.0 NaN NaN 4.0 1.0 ... 50.0 85.0 2.0 2.0 3.0 2 2208 0 0 2
2 2.0 1 530334 38.3 40.0 24.0 1.0 1.0 3.0 1.0 ... 33.0 6.7 NaN NaN 1.0 2 0 0 0 1
3 1.0 9 5290409 39.1 164.0 84.0 4.0 1.0 6.0 2.0 ... 48.0 7.2 3.0 5.3 2.0 1 2208 0 0 1
4 2.0 1 530255 37.3 104.0 35.0 NaN NaN 6.0 2.0 ... 74.0 7.4 NaN NaN 2.0 2 4300 0 0 2

5 rows × 28 columns

for i in range(dataframe.shape[1]):
    n_miss = dataframe[i].isnull().sum()
    perc = (n_miss / dataframe.shape[0]) * 100
    print('%d, Missing: %d (%.1f%%) ' % (i, n_miss, perc))
0, Missing: 1 (0.3%) 
1, Missing: 0 (0.0%) 
2, Missing: 0 (0.0%) 
3, Missing: 60 (20.0%) 
4, Missing: 24 (8.0%) 
5, Missing: 58 (19.3%) 
6, Missing: 56 (18.7%) 
7, Missing: 69 (23.0%) 
8, Missing: 47 (15.7%) 
9, Missing: 32 (10.7%) 
10, Missing: 55 (18.3%) 
11, Missing: 44 (14.7%) 
12, Missing: 56 (18.7%) 
13, Missing: 104 (34.7%) 
14, Missing: 106 (35.3%) 
15, Missing: 247 (82.3%) 
16, Missing: 102 (34.0%) 
17, Missing: 118 (39.3%) 
18, Missing: 29 (9.7%) 
19, Missing: 33 (11.0%) 
20, Missing: 165 (55.0%) 
21, Missing: 198 (66.0%) 
22, Missing: 1 (0.3%) 
23, Missing: 0 (0.0%) 
24, Missing: 0 (0.0%) 
25, Missing: 0 (0.0%) 
26, Missing: 0 (0.0%) 
27, Missing: 0 (0.0%) 
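
The same summary can be obtained directly from pandas; a one-line sketch for counts and percentages:

print(dataframe.isnull().sum())          # missing count per column
print(dataframe.isnull().mean() * 100)   # missing percentage per column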

Statistical Imputation

Statistical Imputation With SimpleImputer

from numpy import isnan
from sklearn.impute import SimpleImputer
dataframe = read_csv(data_path + 'horse-colic.csv', header=None, na_values='?')
dataframe
0 1 2 3 4 5 6 7 8 9 ... 18 19 20 21 22 23 24 25 26 27
0 2.0 1 530101 38.5 66.0 28.0 3.0 3.0 NaN 2.0 ... 45.0 8.4 NaN NaN 2.0 2 11300 0 0 2
1 1.0 1 534817 39.2 88.0 20.0 NaN NaN 4.0 1.0 ... 50.0 85.0 2.0 2.0 3.0 2 2208 0 0 2
2 2.0 1 530334 38.3 40.0 24.0 1.0 1.0 3.0 1.0 ... 33.0 6.7 NaN NaN 1.0 2 0 0 0 1
3 1.0 9 5290409 39.1 164.0 84.0 4.0 1.0 6.0 2.0 ... 48.0 7.2 3.0 5.3 2.0 1 2208 0 0 1
4 2.0 1 530255 37.3 104.0 35.0 NaN NaN 6.0 2.0 ... 74.0 7.4 NaN NaN 2.0 2 4300 0 0 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
295 1.0 1 533886 NaN 120.0 70.0 4.0 NaN 4.0 2.0 ... 55.0 65.0 NaN NaN 3.0 2 3205 0 0 2
296 2.0 1 527702 37.2 72.0 24.0 3.0 2.0 4.0 2.0 ... 44.0 NaN 3.0 3.3 3.0 1 2208 0 0 1
297 1.0 1 529386 37.5 72.0 30.0 4.0 3.0 4.0 1.0 ... 60.0 6.8 NaN NaN 2.0 1 3205 0 0 2
298 1.0 1 530612 36.5 100.0 24.0 3.0 3.0 3.0 1.0 ... 50.0 6.0 3.0 3.4 1.0 1 2208 0 0 1
299 1.0 1 534618 37.2 40.0 20.0 NaN NaN NaN NaN ... 36.0 62.0 1.0 1.0 3.0 2 6112 0 0 2

300 rows × 28 columns

data = dataframe.values
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]
print(f'Missing: {sum(isnan(X).flatten())}')
Missing: 1605
imputer = SimpleImputer(strategy='mean')
imputer.fit(X)
SimpleImputer()
Xtrans = imputer.transform(X)
print(f'Missing: {sum(isnan(Xtrans).flatten())}')
Missing: 0
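
SimpleImputer supports several statistics through its strategy parameter ('mean', 'median', 'most_frequent', 'constant'); a sketch of how these could be compared in a modeling pipeline, mirroring the KNNImputer evaluation below:

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
for strategy in ['mean', 'median', 'most_frequent', 'constant']:
    pipeline = Pipeline(steps=[('i', SimpleImputer(strategy=strategy)), ('m', RandomForestClassifier())])
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
    print(f'{strategy}: {scores.mean():.3f} ({scores.std():.3f})')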

KNN Imputation

Nearest Neighbor Imputation with KNNImputer

from numpy import isnan
from sklearn.impute import KNNImputer
data = dataframe.values
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]
print(f'Missing: {sum(isnan(X).flatten())}')
Missing: 1605
imputer = KNNImputer()
imputer.fit(X)
KNNImputer()
Xtrans = imputer.transform(X)
print(f'Missing: {sum(isnan(Xtrans).flatten())}')
Missing: 0

KNNImputer and Model Evaluation

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
df = read_csv(data_path + 'horse-colic.csv', header=None, na_values='?')
data = df.values
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]
model = RandomForestClassifier()
imputer = KNNImputer()
pipeline = Pipeline(steps=[('i', imputer), ('m', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
print(f'{scores.mean():.3f} ({scores.std():.3f})')
0.862 (0.048)

KNNImputer and Different Number of Neighbors

from matplotlib import pyplot
df = read_csv(data_path + 'horse-colic.csv', header=None, na_values='?')
data = df.values
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]
results = list()
strategies = [str(i) for i in [1,3,5,7,9,15,18,21]]
for s in strategies:
    pipeline = Pipeline(steps=[('i', KNNImputer(n_neighbors=int(s))), ('m', RandomForestClassifier())])
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
    results.append(scores)
    print(f'{s} {scores.mean():.3f} ({scores.std():0.3f})')
pyplot.boxplot(results, labels=strategies, showmeans=True)
pyplot.show()

KNNImputer Transform When Making a Prediction

from numpy import nan
df = read_csv(data_path + 'horse-colic.csv', header=None, na_values='?')
data = df.values
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]
pipeline = Pipeline(steps=[('i', KNNImputer(n_neighbors=21)), ('m', RandomForestClassifier())])
pipeline.fit(X, y)
row = [2, 1, 530101, 38.50, 66, 28, 3, 3, nan, 2, 5, 4, 4, nan, nan, nan, 3, 5, 45.00,
8.40, nan, nan, 2, 11300, 00000, 00000, 2]
yhat = pipeline.predict([row])
print(f'Predicted Class: {yhat[0]}')

Iterative Imputation

from numpy import isnan
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

Iterative Imputation With IterativeImputer

IterativeImputer Data Transform

df = read_csv(data_path + 'horse-colic.csv', header=None, na_values='?')
data = df.values
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]
print(f'Missing: {sum(isnan(X).flatten())}')
imputer = IterativeImputer()
imputer.fit(X)
Xtrans = imputer.transform(X)
print(f'Missing: {sum(isnan(Xtrans).flatten())}')

IterativeImputer and Model Evaluation

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
df = read_csv(data_path + 'horse-colic.csv', header=None, na_values='?')
data = df.values
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]
model = RandomForestClassifier()
imputer = IterativeImputer()
pipeline = Pipeline(steps=[('i', imputer), ('m', model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
print(f'{scores.mean():.3f} ({scores.std():.3f})')

IterativeImputer and Different Imputation Order

from matplotlib import pyplot
dataframe = read_csv(data_path + 'horse-colic.csv', header=None, na_values='?')

data = dataframe.values
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]
results = list()
strategies = ['ascending', 'descending', 'roman', 'arabic', 'random']
for s in strategies:
    pipeline = Pipeline(steps=[('i', IterativeImputer(imputation_order=s)), ('m', RandomForestClassifier())])
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
    results.append(scores)
    print(f'{s}: {scores.mean():.3f} ({scores.std():.3f})')
pyplot.boxplot(results, showmeans=True, labels=strategies)
pyplot.show()

IterativeImputer and Different Number of Iterations
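
As a sketch (not run here), the number of imputation rounds can be varied through IterativeImputer's max_iter parameter and each setting evaluated with the same pipeline and cross-validation as above; the iteration counts below are illustrative:

from matplotlib import pyplot
df = read_csv(data_path + 'horse-colic.csv', header=None, na_values='?')
data = df.values
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]
results = list()
strategies = [str(i) for i in range(1, 21, 4)]  # illustrative numbers of iterations
for s in strategies:
    pipeline = Pipeline(steps=[('i', IterativeImputer(max_iter=int(s))), ('m', RandomForestClassifier())])
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
    results.append(scores)
    print(f'{s}: {scores.mean():.3f} ({scores.std():.3f})')
pyplot.boxplot(results, showmeans=True, labels=strategies)
pyplot.show()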

IterativeImputer Transform When Making a Prediction
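
A minimal sketch (not run here), mirroring the KNNImputer prediction example above and reusing the same example row with missing (NaN) entries:

from numpy import nan
df = read_csv(data_path + 'horse-colic.csv', header=None, na_values='?')
data = df.values
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]
pipeline = Pipeline(steps=[('i', IterativeImputer()), ('m', RandomForestClassifier())])
pipeline.fit(X, y)
row = [2, 1, 530101, 38.50, 66, 28, 3, 3, nan, 2, 5, 4, 4, nan, nan, nan, 3, 5, 45.00,
       8.40, nan, nan, 2, 11300, 0, 0, 2]
yhat = pipeline.predict([row])
print(f'Predicted Class: {yhat[0]}')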