from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, \
confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
29 Imbalanced Data (Titanic Dataset)
When working with imbalanced data, we have a couple of options available to us. For example, we can:
- create additional synthetic data
- use features built in to models to affect how much weight they give to samples from the minority class

Both approaches are sketched briefly below and demonstrated in full through this chapter.
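As a rough preview, here is a minimal sketch of my own (not the chapter's full workflow), assuming a feature matrix X and a label vector y are already prepared:
# Option 1: ask the model itself to upweight the minority class
from sklearn.linear_model import LogisticRegression
weighted_model = LogisticRegression(class_weight="balanced")
# weighted_model.fit(X, y)
# Option 2: generate extra synthetic minority-class samples before fitting
from imblearn.over_sampling import SMOTE
# X_resampled, y_resampled = SMOTE().fit_resample(X, y)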
Let’s import our titanic dataset as before.
try:
    data = pd.read_csv("data/processed_data.csv")
except FileNotFoundError:
    # Download processed data:
    address = 'https://raw.githubusercontent.com/MichaelAllen1966/' + \
        '1804_python_healthcare/master/titanic/data/processed_data.csv'
    data = pd.read_csv(address)
    # Create a data subfolder if one does not already exist
    import os
    data_directory = './data/'
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)
    # Save data
    data.to_csv(data_directory + 'processed_data.csv', index=False)
data = data.astype(float)
# Drop PassengerId (axis=1 indicates we are removing a column rather than a row)
# We drop passenger ID as it is not original data
data.drop('PassengerId', inplace=True, axis=1)
X = data.drop('Survived',axis=1) # X = all 'data' except the 'survived' column
y = data['Survived'] # y = 'survived' column from 'data'
feature_names = X.columns.tolist()
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_validate, y_train, y_validate = train_test_split(X_train_val, y_train_val,
test_size=0.2, random_state=42)
print(f"Training Dataset Samples: {len(X_train)}")
print(f"Validation Dataset Samples: {len(X_validate)}")
print(f"Testing Dataset Samples: {len(X_test)}")Training Dataset Samples: 569
Validation Dataset Samples: 143
Testing Dataset Samples: 179
# Let's also check that the class splits are as expected
number_positive_class = np.sum(data['Survived'] == 1)
number_negative_class = np.sum(data['Survived'] == 0)
print (f"Positives : {number_positive_class}")
print (f"Negatives : {number_negative_class}")Positives : 342
Negatives : 549
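So there are roughly 1.6 negative samples for every positive sample: a moderate imbalance. As a small sketch of my own (not in the original), we can compute that ratio now; it is the same quantity the chapter later passes to XGBoost's scale_pos_weight parameter.
# Ratio of negative to positive samples (roughly 1.6 for this dataset)
imbalance_ratio = number_negative_class / number_positive_class
print(f"Negative-to-positive ratio: {imbalance_ratio:.2f}")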
We'll also create a helper function that fits a given model, evaluates it on the validation data, and returns the key metrics as a one-row table so we can compare our different approaches.
def fit_train(name="Logistic Regression",
X_train=X_train, X_validate=X_validate,
y_train=y_train, y_validate=y_validate,
model=LogisticRegression()
):
model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_validate)
tn, fp, fn, tp = confusion_matrix(y_validate, y_pred_val, labels=[0, 1]).ravel()
return pd.DataFrame({
'Accuracy (training)': np.mean(y_pred_train == y_train),
'Accuracy (validation)': np.mean(y_pred_val == y_validate),
'Precision (validation)': precision_score(y_validate, y_pred_val, average='macro'),
'Recall (validation)': recall_score(y_validate, y_pred_val, average='macro'),
"AUC": roc_auc_score(y_validate, y_pred_val),
"f1": f1_score(y_validate, y_pred_val, average='macro'),
"FP": fp,
"FN": fn
}, index=[name]
    ).round(3)
29.1 In-model options
Certain models have built-in options that allow us to account for imbalanced data automatically. XGBoost, for example, has a scale_pos_weight parameter that controls the weight given to the positive class; its documentation suggests a typical value of the number of negative samples divided by the number of positive samples.
results_df = fit_train("Standard", model=XGBClassifier())
results_df
|   | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | AUC | f1 | FP | FN |
|---|---|---|---|---|---|---|---|---|
| Standard | 0.979 | 0.797 | 0.788 | 0.786 | 0.786 | 0.787 | 14 | 15 |
model = XGBClassifier(
    random_state=42,
    scale_pos_weight=number_negative_class/number_positive_class
)
results_df = pd.concat([results_df,
                        fit_train("With Imbalanced Data Parameter", model=model)]
                       )
results_df
|   | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | AUC | f1 | FP | FN |
|---|---|---|---|---|---|---|---|---|
| Standard | 0.979 | 0.797 | 0.788 | 0.786 | 0.786 | 0.787 | 14 | 15 |
| With Imbalanced Data Parameter | 0.988 | 0.811 | 0.802 | 0.800 | 0.800 | 0.801 | 13 | 14 |
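One detail worth noting: the ratio above is computed from the full dataset, which includes the validation and test rows. A common variation, sketched below as my own suggestion rather than something the chapter does, is to derive the weight from the training labels only, so the held-out folds play no part in any fitting decision.
# Hypothetical variation: compute the class ratio from the training split only
train_negatives = np.sum(y_train == 0)
train_positives = np.sum(y_train == 1)
model_train_weighted = XGBClassifier(
    random_state=42,
    scale_pos_weight=train_negatives / train_positives
)
# Returns a one-row results table in the same format as above
fit_train("Weight From Training Labels Only", model=model_train_weighted)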
29.1.1 Logistic Regression
def standardise_data(X_train, X_test):
    # Initialise a new scaling object for normalising input data
    sc = StandardScaler()
    # Set up the scaler just on the training set
    sc.fit(X_train)
    # Apply the scaler to the training and test sets
    train_std = sc.transform(X_train)
    test_std = sc.transform(X_test)
    return train_std, test_std
X_train_std, X_test_std = standardise_data(X_train, X_test)
results_df = fit_train("Logistic Regression", model=LogisticRegression(),
                       X_train=X_train_std,
                       X_validate=X_test_std,
                       y_train=y_train,
                       y_validate=y_test)
results_df
|   | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | AUC | f1 | FP | FN |
|---|---|---|---|---|---|---|---|---|
| Logistic Regression | 0.8 | 0.832 | 0.827 | 0.827 | 0.827 | 0.827 | 15 | 15 |
Logistic regression has a class_weight option for the same purpose. According to the scikit-learn documentation: "The 'balanced' mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y))."
What will this be in our case?
len(X_train_std) / (2 * np.bincount(y_train))
array([0.79691877, 1.34198113])
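So "balanced" weights class 0 by roughly 0.80 and class 1 by roughly 1.34. Equivalently (a sketch of my own, not from the original), we could pass those numbers in explicitly as a dictionary, which should behave the same as the class_weight="balanced" setting used next.
# Reproduce the "balanced" weights by hand and pass them as an explicit dict
weights = len(X_train_std) / (2 * np.bincount(y_train))
manual_weight_model = LogisticRegression(class_weight={0: weights[0], 1: weights[1]})
# manual_weight_model.fit(X_train_std, y_train)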
model_lr = LogisticRegression(
class_weight="balanced"
)
results_df = pd.concat([results_df,
fit_train("With Imbalanced Data Parameter", model=model_lr,
X_train=X_train_std,
X_validate=X_test_std,
y_train=y_train,
y_validate=y_test)
]
)
results_df
|   | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | AUC | f1 | FP | FN |
|---|---|---|---|---|---|---|---|---|
| Logistic Regression | 0.800 | 0.832 | 0.827 | 0.827 | 0.827 | 0.827 | 15 | 15 |
| With Imbalanced Data Parameter | 0.803 | 0.804 | 0.800 | 0.807 | 0.807 | 0.801 | 22 | 13 |
In our case this doesn't seem to have helped overall, but it is interesting to see the strong effect on the balance of errors: the weighted model trades extra false positives for fewer false negatives. If we were most interested in maximising one aspect of performance (for example, recall on survivors), the slight loss in overall accuracy might feel worthwhile.
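We could push that trade-off further with a custom weighting. The dictionary below is purely illustrative (my own numbers, not from the chapter); heavier weights on class 1 will generally buy fewer false negatives at the cost of more false positives.
# Illustrative only: upweight the positive (survived) class more aggressively
recall_focused_model = LogisticRegression(class_weight={0: 1, 1: 3})
fit_train("Custom Class Weights (1:3)", model=recall_focused_model,
          X_train=X_train_std,
          X_validate=X_test_std,
          y_train=y_train,
          y_validate=y_test)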
29.1.2 Upsampling using synthetic data generators like SMOTE
def make_synthetic_data_smote(X, y, number_of_samples=[1000, 1000]):
    """
    Synthetic data generation for two classes.

    Inputs
    ------
    original_data: X, y numpy arrays (y should have label 0 and 1)
    number_of_samples: number of samples to generate (list for y=0, y=1)
    (Note - number_of_samples has default of 1000 samples for each class
    if no numbers are specified at the point of calling the function)

    Returns
    -------
    X_synthetic: NumPy array
    y_synthetic: NumPy array
    """
    # Count instances in each class
    count_label_0 = np.sum(y == 0)
    count_label_1 = np.sum(y == 1)

    # SMOTE requires final class counts; add current counts to required counts
    # (which are passed into the function)
    n_class_0 = number_of_samples[0] + count_label_0
    n_class_1 = number_of_samples[1] + count_label_1

    # Use SMOTE to sample data points. The number of points that we pass over
    # to SMOTE is calculated above (the number of synthetic data samples we
    # want, which we passed into the function + the counts from the original
    # data). This tells SMOTE how many TOTAL data points are needed (original
    # + synthetic) for each class. It then uses the original data to generate
    # new synthetic data points.
    # For example, imagine our original data has 100 samples for class 0 and 50
    # for class 1, and we tell SMOTE we want 100 synthetic data points for
    # class 0 and 150 synthetic data points for class 1. We tell SMOTE that we
    # need a total of 200 data points for class 0 (100 original + 100 synthetic)
    # and 200 data points for class 1 (50 original + 150 synthetic). It will
    # then fill those data points by taking the original data (which will fill
    # up the first 100 "slots" for class 0, and the first 50 "slots" for class 1)
    # and then use these original data points to sample new synthetic data points
    # to fill the remaining "slots" in each class.
    X_resampled, y_resampled = SMOTE(
        sampling_strategy={0: n_class_0, 1: n_class_1}).fit_resample(X, y)

    # Get just the additional (synthetic) data points. By using len(X) for the
    # X (input feature) data, and len(y) for the y (output label) data, we skip
    # the original data, and just start from the newly created synthetic data,
    # generated by SMOTE (above)
    X_synthetic = X_resampled[len(X):]
    y_synthetic = y_resampled[len(y):]
    return X_synthetic, y_synthetic
X.head()
|   | Pclass | Age | SibSp | Parch | Fare | AgeImputed | EmbarkedImputed | CabinLetterImputed | CabinNumber | CabinNumberImputed | ... | Embarked_missing | CabinLetter_A | CabinLetter_B | CabinLetter_C | CabinLetter_D | CabinLetter_E | CabinLetter_F | CabinLetter_G | CabinLetter_T | CabinLetter_missing |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3.0 | 22.0 | 1.0 | 0.0 | 7.2500 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 1.0 | 38.0 | 1.0 | 0.0 | 71.2833 | 0.0 | 0.0 | 0.0 | 85.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 3.0 | 26.0 | 0.0 | 0.0 | 7.9250 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 3 | 1.0 | 35.0 | 1.0 | 0.0 | 53.1000 | 0.0 | 0.0 | 0.0 | 123.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 3.0 | 35.0 | 0.0 | 0.0 | 8.0500 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
5 rows × 24 columns
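To make the counting logic in make_synthetic_data_smote concrete, here is a small self-contained sketch of my own (a toy example, not part of the chapter's pipeline) using the 100/50 scenario described in the comments above:
# Toy dataset: 100 samples of class 0 and 50 samples of class 1
rng = np.random.default_rng(42)
X_toy = np.vstack([rng.normal(0, 1, size=(100, 3)),
                   rng.normal(3, 1, size=(50, 3))])
y_toy = np.array([0] * 100 + [1] * 50)
# Ask SMOTE for TOTALS of 200 per class:
# 100 real + 100 synthetic for class 0, 50 real + 150 synthetic for class 1
X_res, y_res = SMOTE(sampling_strategy={0: 200, 1: 200},
                     random_state=42).fit_resample(X_toy, y_toy)
print(np.bincount(y_res))            # [200 200]
X_synth_only = X_res[len(X_toy):]    # the 250 newly generated rows come last
print(len(X_synth_only))             # 250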
# Get full list of column names (the names of our features)
X_col_names = X.columns.tolist()
# Set categorical one-hots cols using common prefix
categorical = ['CabinLetter_', 'Embarked_']
one_hot_cols = []
for col in categorical:
    one_hot_cols.append([x for x in X_col_names if x[0:len(col)] == col])
# Set integer columns
integer_cols = ['Age', 'Pclass']
# Don't need to explicitly set float cols
# Set binary columns
binary_cols = ['SibSp', 'Parch', 'AgeImputed', 'EmbarkedImputed']
# Generate synthetic data, this time asking for 350 extra synthetic data points
# for the positive class (more than we will eventually keep, so we can filter
# some out later) and 0 for the negative class
X_synthetic, y_synthetic = make_synthetic_data_smote(
X_train, y_train, number_of_samples=[0, 350]
)
# Set y_label
y_label = "Survived"
# Create a DataFrame to store the synthetic data
synth_df = pd.DataFrame()
# Transfer X values to the new DataFrame
synth_df = pd.concat([synth_df,
                      pd.DataFrame(X_synthetic, columns=X.columns.to_list())],
                     axis=1)
synth_df
|   | Pclass | Age | SibSp | Parch | Fare | AgeImputed | EmbarkedImputed | CabinLetterImputed | CabinNumber | CabinNumberImputed | ... | Embarked_missing | CabinLetter_A | CabinLetter_B | CabinLetter_C | CabinLetter_D | CabinLetter_E | CabinLetter_F | CabinLetter_G | CabinLetter_T | CabinLetter_missing |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 569 | 1.000000 | 34.674002 | 1.000000 | 0.000000 | 90.000000 | 0.0 | 0.0 | 0.0 | 90.555015 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 570 | 1.000000 | 35.938509 | 0.000000 | 0.000000 | 223.817179 | 0.0 | 0.0 | 0.0 | 35.837817 | 0.0 | ... | 0.0 | 0.0 | 0.229055 | 0.770945 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 571 | 1.000000 | 55.300072 | 0.000000 | 0.850004 | 153.462500 | 0.0 | 0.0 | 0.0 | 125.000000 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 1.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 572 | 1.000000 | 22.943377 | 0.000000 | 0.528311 | 61.745149 | 0.0 | 0.0 | 0.0 | 33.943377 | 0.0 | ... | 0.0 | 0.0 | 0.471689 | 0.000000 | 0.0 | 0.528311 | 0.0 | 0.0 | 0.0 | 0.0 |
| 573 | 1.025236 | 27.050471 | 0.025236 | 0.000000 | 30.386439 | 0.0 | 0.0 | 1.0 | 0.000000 | 1.0 | ... | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 914 | 2.000000 | 25.995775 | 1.000000 | 0.751056 | 29.004225 | 0.0 | 0.0 | 1.0 | 0.000000 | 1.0 | ... | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 1.0 |
| 915 | 2.945161 | 62.945161 | 0.000000 | 0.000000 | 9.637540 | 0.0 | 0.0 | 1.0 | 0.000000 | 1.0 | ... | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 1.0 |
| 916 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 7.750000 | 1.0 | 0.0 | 1.0 | 0.000000 | 1.0 | ... | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 1.0 |
| 917 | 1.773067 | 33.067333 | 0.226933 | 0.000000 | 22.100026 | 0.0 | 0.0 | 0.0 | 105.992533 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 0.226933 | 0.0 | 0.773067 | 0.0 | 0.0 | 0.0 | 0.0 |
| 918 | 3.000000 | 15.809182 | 0.000000 | 0.000000 | 7.892575 | 0.0 | 0.0 | 1.0 | 0.000000 | 1.0 | ... | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 1.0 |
350 rows × 24 columns
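Notice that we collected one_hot_cols earlier but the processing below only rounds the integer and binary columns, so groups like CabinLetter_ can keep fractional values (as in the table above). A possible extra step, offered here as my own sketch rather than something the chapter does, is to snap each one-hot group back to a valid encoding by setting the largest value in the group to 1 and the rest to 0.
# Optional extra clean-up (assumption, not in the original): make each one-hot
# group a valid encoding again by keeping only the largest value per row
for group in one_hot_cols:
    # Column holding the largest value for each synthetic row
    max_col_per_row = synth_df[group].idxmax(axis=1)
    # Zero the whole group, then set the winning column to 1
    synth_df[group] = 0.0
    for row_idx, col_name in max_col_per_row.items():
        synth_df.at[row_idx, col_name] = 1.0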
# Make integer as necessary by rounding the raw synthetic data
for col in integer_cols:
    synth_df[col] = synth_df[col].round(0)
# Round binary cols and clip so values under 0 or above 1
# are set to 0 and 1 respectively (this won't happen with
# SMOTE, as it will only sample between the two points (so
# points sampled between binary points will always be
# between 0 and 1) but it can happen with other methods)
for col in binary_cols:
    synth_df[col] = np.clip(synth_df[col], 0, 1).round(0)
# Add y data with a label
y_list = list(y_synthetic)
synth_df[y_label] = y_list
# Shuffle data
synth_df = synth_df.sample(frac=1.0)
# Standardise synthetic data (based on real training data)
X_train_std, X_synth_std = standardise_data(X_train, X_synthetic)
# Get ALL real X data (combine standardised training + test data)
# We do this because we need to check for duplicates / very close
# values in all of the real data we've got
X_real_std = np.concatenate([X_train_std, X_test_std], axis=0)
# Use scikit-learn's neighbors.NearestNeighbors to find the nearest neighbour
# to each data point. First, we fit to the real standardised data
# (all of it, train + test set). Then we can give it the synthetic data
# and ask it to give us the Euclidean distance and ID of its nearest
# real world data point neighbour for each synthetic data point.
nn = NearestNeighbors(n_neighbors=1, algorithm='auto').fit(X_real_std)
dists, idxs = nn.kneighbors(X_synth_std)
# Store the index and ids (indices) in the synthetic data DataFrame
# Flatten just reduces something in more than 1 dimension down to
# 1 dimension (eg a list of lists becomes a single list)
synth_df['distance_to_closest_real'] = list(dists.flatten())
synth_df['closest_X_real_row_index'] = list(idxs.flatten())
# Get points with zero distance to real (use distance of <0.001 as effectively identical)
identical = synth_df['distance_to_closest_real'] < 0.001
print (f'Proportion of data points identical to real data points = {identical.mean():0.3f}')
# Remove points with zero (or effectively zero) distance to a real data point. We
# do this by setting up a mask that says we only want to see data points where the "identical"
# criterion we specified above is false (ie they're not identical). Then we apply that
# mask and overwrite our existing synthetic data DataFrame so we've now only got data points
# that are not identical to real world data points.
mask = identical == False
synth_df = synth_df[mask]
Proportion of data points identical to real data points = 0.074
# Proportion of points to remove
proportion_to_remove = 0.1
# Sort by distance, with highest distances (those we want to keep) at
# the top
synth_by_distance = synth_df.sort_values(
'distance_to_closest_real', ascending=False)
# Limit data. Calculate the number of entries to keep as being the
# total number of synthetic data points we've now got (after having
# removed ones identical to real world data points) multiplied by
# the proportion we want to keep (the inverse of the proportion to remove).
# As we've sorted in descending order by distance, we can then just
# use .head to identify how much of the top of list we want to keep
# (90% in this case, where we're removing the 10% that are closest - at
# the bottom)
number_to_keep = int(len(synth_by_distance) * (1 - proportion_to_remove))
synth_by_distance = synth_by_distance.head(number_to_keep)
# Shuffle and store back in synth_df (frac=1 gives us a sample size of 100%
# (ie - all of the ones we said above we wanted to keep))
synth_df = synth_by_distance.sample(frac=1)
# Keep only a random sample of 150 of the remaining synthetic datapoints
# We don't need a mask here as ALL our synthetic datapoints are for class 1
# (positive).
synth_df = synth_df.sample(150)
synth_df
|   | Pclass | Age | SibSp | Parch | Fare | AgeImputed | EmbarkedImputed | CabinLetterImputed | CabinNumber | CabinNumberImputed | ... | CabinLetter_C | CabinLetter_D | CabinLetter_E | CabinLetter_F | CabinLetter_G | CabinLetter_T | CabinLetter_missing | Survived | distance_to_closest_real | closest_X_real_row_index |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 812 | 1.0 | 40.0 | 0.0 | 0.0 | 79.770103 | 0.0 | 1.0 | 0.000000 | 26.938916 | 0.000000 | ... | 0.000000 | 0.132635 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.0 | 3.069914 | 214 |
| 634 | 2.0 | 32.0 | 1.0 | 1.0 | 26.175431 | 0.0 | 0.0 | 1.000000 | 0.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 1.0 | 0.328739 | 550 |
| 720 | 3.0 | 2.0 | 0.0 | 1.0 | 12.870792 | 0.0 | 0.0 | 1.000000 | 0.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 1.0 | 0.156506 | 266 |
| 886 | 1.0 | 41.0 | 0.0 | 0.0 | 79.085888 | 0.0 | 1.0 | 0.000000 | 29.397383 | 0.000000 | ... | 0.000000 | 0.279477 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.0 | 0.645736 | 101 |
| 881 | 1.0 | 11.0 | 1.0 | 1.0 | 147.110861 | 0.0 | 0.0 | 0.000000 | 24.788982 | 0.000000 | ... | 1.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.0 | 3.136116 | 372 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 635 | 2.0 | 1.0 | 1.0 | 1.0 | 18.613431 | 0.0 | 0.0 | 1.000000 | 0.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 1.0 | 0.311295 | 639 |
| 789 | 1.0 | 13.0 | 0.0 | 1.0 | 79.786677 | 0.0 | 0.0 | 0.000000 | 20.720365 | 0.000000 | ... | 0.000000 | 0.531185 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.0 | 0.378668 | 429 |
| 702 | 2.0 | 24.0 | 1.0 | 1.0 | 65.049929 | 0.0 | 0.0 | 0.968794 | 0.062412 | 0.968794 | ... | 0.031206 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.968794 | 1.0 | 0.565451 | 282 |
| 755 | 2.0 | 31.0 | 0.0 | 0.0 | 13.000000 | 0.0 | 0.0 | 1.000000 | 0.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 1.0 | 0.314695 | 301 |
| 890 | 3.0 | 4.0 | 0.0 | 1.0 | 22.490998 | 0.0 | 0.0 | 1.000000 | 0.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 1.0 | 0.677551 | 160 |
150 rows × 27 columns
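Before we fold these hand-picked synthetic points back into the real data below, it is worth noting that imblearn can also do the balancing in a single step. The sketch below (my own comparison, not part of the chapter) simply lets SMOTE upsample the minority class of the training set to match the majority and trains on the result, without the filtering and sampling we have just done.
# One-step alternative: let SMOTE balance the training data directly
X_train_balanced, y_train_balanced = SMOTE(random_state=42).fit_resample(X_train, y_train)
# Standardise on the balanced training data, then evaluate on the real test set
X_train_balanced_std, X_test_std_simple = standardise_data(X_train_balanced, X_test)
fit_train("Direct SMOTE Balancing", model=LogisticRegression(),
          X_train=X_train_balanced_std, X_validate=X_test_std_simple,
          y_train=y_train_balanced, y_validate=y_test)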
# Add synthetic data for positive class (class 1) to real data
# We'll make a separate copy of the original dataframe with the new synthetic
# data points added, keeping our original data intact.
augmented_data = pd.concat([data, synth_df])
# We'll also get rid of the two columns we added -
# distance_to_closest_real and closest_X_real_row_index as we do not want these
# to be used in a Logistic Regression model.
augmented_data.drop('distance_to_closest_real', axis=1, inplace=True)
augmented_data.drop('closest_X_real_row_index', axis=1, inplace=True)
# Let's have a look at our new dataframe
augmented_data
|   | Survived | Pclass | Age | SibSp | Parch | Fare | AgeImputed | EmbarkedImputed | CabinLetterImputed | CabinNumber | ... | Embarked_missing | CabinLetter_A | CabinLetter_B | CabinLetter_C | CabinLetter_D | CabinLetter_E | CabinLetter_F | CabinLetter_G | CabinLetter_T | CabinLetter_missing |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 3.0 | 22.0 | 1.0 | 0.0 | 7.250000 | 0.0 | 0.0 | 1.000000 | 0.000000 | ... | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 |
| 1 | 1.0 | 1.0 | 38.0 | 1.0 | 0.0 | 71.283300 | 0.0 | 0.0 | 0.000000 | 85.000000 | ... | 0.0 | 0.000000 | 0.0 | 1.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
| 2 | 1.0 | 3.0 | 26.0 | 0.0 | 0.0 | 7.925000 | 0.0 | 0.0 | 1.000000 | 0.000000 | ... | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 |
| 3 | 1.0 | 1.0 | 35.0 | 1.0 | 0.0 | 53.100000 | 0.0 | 0.0 | 0.000000 | 123.000000 | ... | 0.0 | 0.000000 | 0.0 | 1.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
| 4 | 0.0 | 3.0 | 35.0 | 0.0 | 0.0 | 8.050000 | 0.0 | 0.0 | 1.000000 | 0.000000 | ... | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 635 | 1.0 | 2.0 | 1.0 | 1.0 | 1.0 | 18.613431 | 0.0 | 0.0 | 1.000000 | 0.000000 | ... | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 |
| 789 | 1.0 | 1.0 | 13.0 | 0.0 | 1.0 | 79.786677 | 0.0 | 0.0 | 0.000000 | 20.720365 | ... | 0.0 | 0.468815 | 0.0 | 0.000000 | 0.531185 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
| 702 | 1.0 | 2.0 | 24.0 | 1.0 | 1.0 | 65.049929 | 0.0 | 0.0 | 0.968794 | 0.062412 | ... | 0.0 | 0.000000 | 0.0 | 0.031206 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.968794 |
| 755 | 1.0 | 2.0 | 31.0 | 0.0 | 0.0 | 13.000000 | 0.0 | 0.0 | 1.000000 | 0.000000 | ... | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 |
| 890 | 1.0 | 3.0 | 4.0 | 0.0 | 1.0 | 22.490998 | 0.0 | 0.0 | 1.000000 | 0.000000 | ... | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 |
1041 rows × 25 columns
# Let's also check that the class splits are as expected
number_positive_class = np.sum(augmented_data['Survived'] == 1)
number_negative_class = np.sum(augmented_data['Survived'] == 0)
print (f"Positives : {number_positive_class}")
print (f"Negatives : {number_negative_class}")Positives : 492
Negatives : 549
X_aug = augmented_data.drop('Survived',axis=1) # X = all 'augmented_data' except the 'Survived' column
y_aug = augmented_data['Survived'] # y = 'Survived' column from 'augmented_data'
X_train_aug, X_test_aug, y_train_aug, y_test_aug = train_test_split(X_aug, y_aug, test_size = 0.25, random_state=42)
X_train_std_aug, X_test_std_aug = standardise_data(X_train_aug, X_test_aug)
30 Compare performance
results_df = fit_train(X_train=X_train_std_aug, X_validate=X_test_std_aug,
                       y_train=y_train_aug, y_validate=y_test_aug,
                       name="With Synthetic Data")
pd.concat([
    results_df,
    fit_train(X_train=X_train_std, X_validate=X_test_std,
              y_train=y_train, y_validate=y_test,
              name="Without Synthetic Data")])
|   | Accuracy (training) | Accuracy (validation) | Precision (validation) | Recall (validation) | AUC | f1 | FP | FN |
|---|---|---|---|---|---|---|---|---|
| With Synthetic Data | 0.8 | 0.793 | 0.795 | 0.791 | 0.791 | 0.792 | 22 | 32 |
| Without Synthetic Data | 0.8 | 0.832 | 0.827 | 0.827 | 0.827 | 0.827 | 15 | 15 |
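One caveat on the comparison above: because the augmented dataset (real plus synthetic rows) was re-split from scratch, its validation fold contains synthetic rows and its training fold contains some of the original test rows. A stricter check, sketched below as my own addition rather than part of the chapter, trains on the real training rows plus the kept synthetic rows and scores against the purely real hold-out set created at the start.
# Build an augmented TRAINING set: real training rows plus the kept synthetic rows
X_synth_features = synth_df.drop(['Survived', 'distance_to_closest_real',
                                  'closest_X_real_row_index'], axis=1)
X_train_plus_synth = pd.concat([X_train, X_synth_features])
y_train_plus_synth = pd.concat([y_train, synth_df['Survived']])
# Standardise on the augmented training data, then score against the real test set
X_train_plus_synth_std, X_test_real_std = standardise_data(X_train_plus_synth, X_test)
fit_train("Synthetic Augmentation, Real Hold-Out", model=LogisticRegression(),
          X_train=X_train_plus_synth_std, X_validate=X_test_real_std,
          y_train=y_train_plus_synth, y_validate=y_test)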