Consistent one-hot-encoding (#38)

* Reuse code between generalize and transform methods

* Option to get encoder from user

* Consistent encoding for decision tree and generalizations (separate from target model encoding)
This commit is contained in:
abigailgold 2022-05-22 18:02:33 +03:00 committed by GitHub
parent 7055d5ecf6
commit dfa684da6b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 153 additions and 128 deletions

View file

@ -74,8 +74,8 @@ def test_minimizer_fit(data):
predictions = model.predict(ad)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.5)
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
train_dataset = ArrayDataset(X, predictions, features_names=features)
gen.fit(dataset=train_dataset)
@ -102,6 +102,9 @@ def test_minimizer_fit(data):
assert (ncp > 0)
assert (((transformed[indexes]) != (X[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_minimizer_fit_pandas(data):
features = ['age', 'height', 'sex', 'ola']
@ -145,7 +148,8 @@ def test_minimizer_fit_pandas(data):
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features)
train_dataset = ArrayDataset(X, predictions)
gen.fit(dataset=train_dataset)
@ -169,6 +173,9 @@ def test_minimizer_fit_pandas(data):
assert (ncp > 0)
assert (((transformed[modified_features]).equals(X[modified_features])) == False)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_minimizer_params_categorical(data):
# Assume three features, age, sex and height, and boolean label
@ -226,12 +233,16 @@ def test_minimizer_params_categorical(data):
predictions = np.argmax(predictions, axis=1)
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, cells=cells)
train_dataset = ArrayDataset(X, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(X))
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_minimizer_fit_QI(data):
features = ['age', 'height', 'weight']
@ -257,8 +268,8 @@ def test_minimizer_fit_QI(data):
predictions = model.predict(ad)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=QI)
train_dataset = ArrayDataset(X, predictions, features_names=features)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ad)
@ -284,6 +295,9 @@ def test_minimizer_fit_QI(data):
assert (ncp > 0)
assert (((transformed[indexes]) != (X[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_minimizer_fit_pandas_QI(data):
features = ['age', 'height', 'weight', 'sex', 'ola']
@ -329,7 +343,8 @@ def test_minimizer_fit_pandas_QI(data):
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=QI)
train_dataset = ArrayDataset(X, predictions)
gen.fit(dataset=train_dataset)
@ -356,6 +371,9 @@ def test_minimizer_fit_pandas_QI(data):
assert (ncp > 0)
assert (((transformed[modified_features]).equals(X[modified_features])) == False)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_minimize_ndarray_iris():
features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
@ -368,8 +386,8 @@ def test_minimize_ndarray_iris():
predictions = model.predict(ArrayDataset(x_train))
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.3, features_to_minimize=QI)
target_accuracy = 0.3
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=QI)
# gen.fit(dataset=ArrayDataset(x_train, predictions))
transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features))
gener = gen.generalizations
@ -397,6 +415,9 @@ def test_minimize_ndarray_iris():
assert (ncp > 0)
assert (((transformed[indexes]) != (x_train[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_minimize_pandas_adult():
(x_train, y_train), (x_test, y_test) = get_adult_dataset_pd()
@ -433,8 +454,8 @@ def test_minimize_pandas_adult():
predictions = model.predict(ArrayDataset(encoded))
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.7,
target_accuracy = 0.7
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=QI)
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
transformed = gen.transform(dataset=ArrayDataset(x_train))
@ -472,6 +493,9 @@ def test_minimize_pandas_adult():
assert (ncp > 0)
assert (((transformed[modified_features]).equals(x_train[modified_features])) == False)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_german_credit_pandas():
(x_train, y_train), (x_test, y_test) = get_german_credit_dataset_pd()
@ -506,8 +530,8 @@ def test_german_credit_pandas():
predictions = model.predict(ArrayDataset(encoded))
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.7,
target_accuracy = 0.7
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=QI)
gen.fit(dataset=ArrayDataset(x_train, predictions))
transformed = gen.transform(dataset=ArrayDataset(x_train))
@ -545,6 +569,9 @@ def test_german_credit_pandas():
assert (ncp > 0)
assert (((transformed[modified_features]).equals(x_train[modified_features])) == False)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_regression():
dataset = load_diabetes()
@ -558,7 +585,8 @@ def test_regression():
features = ['age', 'sex', 'bmi', 'bp',
's1', 's2', 's3', 's4', 's5', 's6']
gen = GeneralizeToRepresentative(model, target_accuracy=0.7, is_regression=True,
target_accuracy = 0.7
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, is_regression=True,
features_to_minimize=QI)
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features))
@ -615,6 +643,9 @@ def test_regression():
assert (ncp > 0)
assert (((transformed[indexes]) != (x_train[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_X_y(data):
features = [0, 1, 2]
@ -640,8 +671,8 @@ def test_X_y(data):
predictions = model.predict(ad)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=QI)
gen.fit(X=X, y=predictions)
transformed = gen.transform(X)
gener = gen.generalizations
@ -666,6 +697,9 @@ def test_X_y(data):
assert (ncp > 0)
assert (((transformed[indexes]) != (X[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_X_y_features_names(data):
features = ['age', 'height', 'weight']
@ -691,8 +725,8 @@ def test_X_y_features_names(data):
predictions = model.predict(ad)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=QI)
gen.fit(X=X, y=predictions, features_names=features)
transformed = gen.transform(X=X, features_names=features)
gener = gen.generalizations
@ -717,6 +751,9 @@ def test_X_y_features_names(data):
assert (ncp > 0)
assert (((transformed[indexes]) != (X[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_BaseEstimator_classification(data):
features = ['age', 'height', 'weight', 'sex', 'ola']
@ -760,7 +797,8 @@ def test_BaseEstimator_classification(data):
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=QI)
train_dataset = ArrayDataset(X, predictions)
gen.fit(dataset=train_dataset)
@ -787,6 +825,9 @@ def test_BaseEstimator_classification(data):
assert (ncp > 0)
assert (((transformed[modified_features]).equals(X[modified_features])) == False)
rel_accuracy = model.score(preprocessor.transform(transformed), predictions)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_BaseEstimator_regression():
dataset = load_diabetes()
@ -799,8 +840,8 @@ def test_BaseEstimator_regression():
QI = ['age', 'bmi', 's2', 's5']
features = ['age', 'sex', 'bmi', 'bp',
's1', 's2', 's3', 's4', 's5', 's6']
gen = GeneralizeToRepresentative(model, target_accuracy=0.7, is_regression=True,
target_accuracy = 0.7
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, is_regression=True,
features_to_minimize=QI)
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features))
@ -857,6 +898,9 @@ def test_BaseEstimator_regression():
assert (ncp > 0)
assert (((transformed[indexes]) != (x_train[indexes])).any())
rel_accuracy = model.score(transformed, predictions)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_keras_model():
(X, y), (x_test, y_test) = get_iris_dataset_np()
@ -874,8 +918,8 @@ def test_keras_model():
predictions = model.predict(ad)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.5)
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
test_dataset = ArrayDataset(x_test, predictions)
gen.fit(dataset=test_dataset)
@ -895,6 +939,9 @@ def test_keras_model():
assert (ncp > 0)
assert (((transformed[indexes]) != (X[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_blackbox_model():
(X, y), (x_test, y_test) = get_iris_dataset_np()
@ -907,8 +954,8 @@ def test_blackbox_model():
predictions = model.predict(ad)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.5)
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
train_dataset = ArrayDataset(x_test, predictions)
gen.fit(dataset=train_dataset)
@ -939,6 +986,9 @@ def test_blackbox_model():
assert (ncp > 0)
assert (((transformed[indexes]) != (X[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_untouched():
cells = [{"id": 1, "ranges": {"age": {"start": None, "end": 38}}, "label": 0,