from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
# Data Leakage
# Data Preparation With Train and Test Sets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Train-Test Evaluation With Naive Data Preparation
# Build a synthetic binary classification dataset (fixed seed for repeatability).
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
# Rescale the features with MinMaxScaler (normalization, not standardization).
# NOTE: the scaler is fitted on the FULL dataset before it is split, so the
# per-feature min/max of the future test rows influence the transform that is
# applied to the training rows. This is the naive preparation being shown.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
# Split into train and test sets only after the data has already been scaled.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# Fit the model on the training portion.
model = LogisticRegression()
model.fit(X_train, y_train)
# Predict the held-out rows and report classification accuracy as a percentage.
yhat = model.predict(X_test)
accuracy = accuracy_score(y_test, yhat)
print(f'{accuracy*100: 0.3f}%')
# Train-Test Evaluation With Correct Data Preparation
# Build a synthetic binary classification dataset (fixed seed for repeatability).
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
# Split FIRST, so that nothing computed below can see the test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# Fit the scaler on the training rows only ...
scaler = MinMaxScaler()
scaler.fit(X_train)
# ... then apply that same fitted transform to both partitions.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Fit the model on the scaled training data.
model = LogisticRegression()
model.fit(X_train, y_train)
# Predict the held-out rows and report classification accuracy as a percentage.
yhat = model.predict(X_test)
accuracy = accuracy_score(y_test, yhat)
print(f'{accuracy*100: .3f}%')
# Data Preparation With k-fold Cross-Validation
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
# Cross-Validation Evaluation With Naive Data Preparation
# Build a synthetic binary classification dataset (fixed seed for repeatability).
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
# NOTE: the scaler is fitted on the whole dataset up front, so every
# cross-validation fold is transformed using statistics that include its own
# held-out rows. This is the naive preparation being shown.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
model = LogisticRegression()
# 10 stratified folds repeated 3 times -> 30 accuracy scores.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)
# Report mean and standard deviation of accuracy across all folds, in percent.
print(f'{scores.mean()*100: .3f} ({scores.std()*100: .3f})')
# Example output: 85.300 ( 3.607)
# Cross-Validation Evaluation With Correct Data Preparation
# Build a synthetic binary classification dataset (fixed seed for repeatability).
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
# Chain scaling and the model into one estimator. cross_val_score fits the
# whole pipeline per fold, so the scaler is fitted on that fold's training
# rows only and merely applied to its held-out rows.
steps = [
    ('scaler', MinMaxScaler()),
    ('model', LogisticRegression()),
]
pipeline = Pipeline(steps=steps)
# 10 stratified folds repeated 3 times -> 30 accuracy scores.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv)
# Report mean and standard deviation of accuracy across all folds, in percent.
print(f'{scores.mean()*100: .3f} ({scores.std()*100:.3f})')
# Example output: 85.433 (3.471)