Support 1-hot encoded features in anonymization + fixes related to encoding in minimization (#86)

* Support 1-hot encoded features in anonymization (#72) * Fix anonymization adult notebook + new notebook to demonstrate anonymization on 1-hot encoded data * Minimizer: No default encoder, if none provided data is supplied to the model as is. Fix data type of representative values. Fix and add more tests. Signed-off-by: abigailt <abigailt@il.ibm.com>
2026-06-29 15:59:38 +02:00 · 2023-10-19 11:48:15 +03:00 · 2023-10-19 11:48:15 +03:00 · 5dce961092
commit 5dce961092
parent 26addd192f
7 changed files with 670 additions and 255 deletions
--- a/tests/test_anonymizer.py
+++ b/tests/test_anonymizer.py
@ -1,5 +1,6 @@
 import pytest
 import numpy as np
+import pandas as pd
 from sklearn.compose import ColumnTransformer
 from sklearn.impute import SimpleImputer
 from sklearn.pipeline import Pipeline
@ -118,6 +119,74 @@ def test_regression():
    assert ((np.delete(anon, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())


+def test_anonymize_ndarray_one_hot():
+    x_train = np.array([[23, 0, 1, 165],
+                        [45, 0, 1, 158],
+                        [56, 1, 0, 123],
+                        [67, 0, 1, 154],
+                        [45, 1, 0, 149],
+                        [42, 1, 0, 166],
+                        [73, 0, 1, 172],
+                        [94, 0, 1, 168],
+                        [69, 0, 1, 175],
+                        [24, 1, 0, 181],
+                        [18, 1, 0, 190]])
+    y_train = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+
+    model = DecisionTreeClassifier()
+    model.fit(x_train, y_train)
+    pred = model.predict(x_train)
+
+    k = 10
+    QI = [0, 1, 2]
+    QI_slices = [[1, 2]]
+    anonymizer = Anonymize(k, QI, train_only_QI=True, quasi_identifer_slices=QI_slices)
+    anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
+    assert (len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0)))
+    _, counts_elements = np.unique(anon[:, QI], return_counts=True)
+    assert (np.min(counts_elements) >= k)
+    assert ((np.delete(anon, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
+    anonymized_slice = anon[:, QI_slices[0]]
+    assert ((np.sum(anonymized_slice, axis=1) == 1).all())
+    assert ((np.max(anonymized_slice, axis=1) == 1).all())
+    assert ((np.min(anonymized_slice, axis=1) == 0).all())
+
+
+def test_anonymize_pandas_one_hot():
+    feature_names = ["age", "gender_M", "gender_F", "height"]
+    x_train = np.array([[23, 0, 1, 165],
+                        [45, 0, 1, 158],
+                        [56, 1, 0, 123],
+                        [67, 0, 1, 154],
+                        [45, 1, 0, 149],
+                        [42, 1, 0, 166],
+                        [73, 0, 1, 172],
+                        [94, 0, 1, 168],
+                        [69, 0, 1, 175],
+                        [24, 1, 0, 181],
+                        [18, 1, 0, 190]])
+    y_train = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+    x_train = pd.DataFrame(x_train, columns=feature_names)
+    y_train = pd.Series(y_train)
+
+    model = DecisionTreeClassifier()
+    model.fit(x_train, y_train)
+    pred = model.predict(x_train)
+
+    k = 10
+    QI = ["age", "gender_M", "gender_F"]
+    QI_slices = [["gender_M", "gender_F"]]
+    anonymizer = Anonymize(k, QI, train_only_QI=True, quasi_identifer_slices=QI_slices)
+    anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
+    assert (anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
+    assert (anon.loc[:, QI].value_counts().min() >= k)
+    np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
+    anonymized_slice = anon.loc[:, QI_slices[0]]
+    assert ((np.sum(anonymized_slice, axis=1) == 1).all())
+    assert ((np.max(anonymized_slice, axis=1) == 1).all())
+    assert ((np.min(anonymized_slice, axis=1) == 0).all())
+
+
 def test_errors():
    with pytest.raises(ValueError):
        Anonymize(1, [0, 2])
--- a/tests/test_minimizer.py
+++ b/tests/test_minimizer.py
@ -11,6 +11,8 @@ from sklearn.model_selection import train_test_split
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder

+from torch import nn, optim
+
 import tensorflow as tf
 from tensorflow.keras.models import Sequential
 from tensorflow.keras.layers import Dense, Input
@ -24,6 +26,9 @@ from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegresso
 tf.compat.v1.disable_eager_execution()


+ACCURACY_DIFF = 0.05
+
+
@pytest.fixture
 def diabetes_dataset():
    return load_diabetes()
@ -286,7 +291,7 @@ def test_minimizer_fit(data_two_features):
    check_ncp(ncp, expected_generalizations)

    rel_accuracy = model.score(ArrayDataset(transformed, predictions))
-    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)


 def test_minimizer_ncp(data_two_features):
@ -348,12 +353,15 @@ def test_minimizer_ncp_categorical(data_four_features):
    train_dataset = ArrayDataset(x, predictions, features_names=features)

    gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
-                                      categorical_features=categorical_features, generalize_using_transform=False)
+                                      categorical_features=categorical_features,
+                                      generalize_using_transform=False,
+                                      encoder=preprocessor)
    gen1.fit(dataset=train_dataset)
    ncp1 = gen1.ncp.fit_score
    ncp2 = gen1.calculate_ncp(ad1)

-    gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, categorical_features=categorical_features)
+    gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, categorical_features=categorical_features,
+                                      encoder=preprocessor)
    gen2.fit(dataset=train_dataset)
    ncp3 = gen2.ncp.fit_score
    gen2.transform(dataset=ad1)
@ -414,7 +422,8 @@ def test_minimizer_fit_pandas(data_four_features):
    # Now we have a full prediction pipeline.
    target_accuracy = 0.5
    gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
-                                     categorical_features=categorical_features)
+                                     categorical_features=categorical_features,
+                                     encoder=preprocessor)
    train_dataset = ArrayDataset(x, predictions)
    gen.fit(dataset=train_dataset)
    transformed = gen.transform(dataset=ArrayDataset(x))
@ -428,7 +437,7 @@ def test_minimizer_fit_pandas(data_four_features):
    check_ncp(ncp, expected_generalizations)

    rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
-    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)


 def test_minimizer_params_categorical(cells_categorical):
@ -450,13 +459,14 @@ def test_minimizer_params_categorical(cells_categorical):
    # Now we have a full prediction pipeline.
    target_accuracy = 0.5
    gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
-                                     categorical_features=categorical_features, cells=cells)
+                                     categorical_features=categorical_features, cells=cells,
+                                     encoder=preprocessor)
    train_dataset = ArrayDataset(x, predictions)
    gen.fit(dataset=train_dataset)
    transformed = gen.transform(dataset=ArrayDataset(x))

    rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
-    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)


 def test_minimizer_fit_qi(data_three_features):
@ -484,7 +494,7 @@ def test_minimizer_fit_qi(data_three_features):
    check_ncp(ncp, expected_generalizations)

    rel_accuracy = model.score(ArrayDataset(transformed, predictions))
-    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)


 def test_minimizer_fit_pandas_qi(data_five_features):
@ -508,7 +518,8 @@ def test_minimizer_fit_pandas_qi(data_five_features):
    # Now we have a full prediction pipeline.
    target_accuracy = 0.5
    gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
-                                     categorical_features=categorical_features, features_to_minimize=qi)
+                                     categorical_features=categorical_features, features_to_minimize=qi,
+                                     encoder=preprocessor)
    train_dataset = ArrayDataset(x, predictions)
    gen.fit(dataset=train_dataset)
    transformed = gen.transform(dataset=ArrayDataset(x))
@ -523,7 +534,7 @@ def test_minimizer_fit_pandas_qi(data_five_features):
    check_ncp(ncp, expected_generalizations)

    rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
-    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)


 def test_minimize_ndarray_iris():
@ -552,7 +563,7 @@ def test_minimize_ndarray_iris():
    check_ncp(ncp, expected_generalizations)

    rel_accuracy = model.score(ArrayDataset(transformed, predictions))
-    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)


 def test_minimize_pandas_adult():
@ -582,7 +593,8 @@ def test_minimize_pandas_adult():
        predictions = np.argmax(predictions, axis=1)
    target_accuracy = 0.7
    gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
-                                     categorical_features=categorical_features, features_to_minimize=qi)
+                                     categorical_features=categorical_features, features_to_minimize=qi,
+                                     encoder=preprocessor)
    gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
    transformed = gen.transform(dataset=ArrayDataset(x_train))
    gener = gen.generalizations
@ -609,7 +621,7 @@ def test_minimize_pandas_adult():
    check_ncp(ncp, expected_generalizations)

    rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
-    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)


 def test_german_credit_pandas():
@ -637,7 +649,8 @@ def test_german_credit_pandas():
        predictions = np.argmax(predictions, axis=1)
    target_accuracy = 0.7
    gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
-                                     categorical_features=categorical_features, features_to_minimize=qi)
+                                     categorical_features=categorical_features, features_to_minimize=qi,
+                                     encoder=preprocessor)
    gen.fit(dataset=ArrayDataset(x_train, predictions))
    transformed = gen.transform(dataset=ArrayDataset(x_train))
    gener = gen.generalizations
@ -666,7 +679,7 @@ def test_german_credit_pandas():
    check_ncp(ncp, expected_generalizations)

    rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
-    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)


 def test_regression(diabetes_dataset):
@ -726,7 +739,7 @@ def test_regression(diabetes_dataset):
    check_ncp(ncp, expected_generalizations)

    rel_accuracy = model.score(ArrayDataset(transformed, predictions))
-    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)


 def test_x_y():
@ -766,7 +779,7 @@ def test_x_y():
    check_ncp(ncp, expected_generalizations)

    rel_accuracy = model.score(ArrayDataset(transformed, predictions))
-    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)


 def test_x_y_features_names():
@ -806,7 +819,7 @@ def test_x_y_features_names():
    check_ncp(ncp, expected_generalizations)

    rel_accuracy = model.score(ArrayDataset(transformed, predictions))
-    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)


 def test_BaseEstimator_classification(data_five_features):
@ -828,7 +841,8 @@ def test_BaseEstimator_classification(data_five_features):
    # Now we have a full prediction pipeline.
    target_accuracy = 0.5
    gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
-                                     categorical_features=categorical_features, features_to_minimize=QI)
+                                     categorical_features=categorical_features, features_to_minimize=QI,
+                                     encoder=preprocessor)
    train_dataset = ArrayDataset(x, predictions)
    gen.fit(dataset=train_dataset)
    transformed = gen.transform(dataset=ArrayDataset(x))
@ -844,7 +858,7 @@ def test_BaseEstimator_classification(data_five_features):
    check_ncp(ncp, expected_generalizations)

    rel_accuracy = model.score(preprocessor.transform(transformed), predictions)
-    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)


 def test_BaseEstimator_regression(diabetes_dataset):
@ -903,7 +917,7 @@ def test_BaseEstimator_regression(diabetes_dataset):
    check_ncp(ncp, expected_generalizations)

    rel_accuracy = model.score(transformed, predictions)
-    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)


 def test_keras_model():
@ -936,7 +950,39 @@ def test_keras_model():
    check_ncp(ncp, gener)

    rel_accuracy = model.score(ArrayDataset(transformed, predictions))
-    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
+
+
+class PytorchModel(nn.Module):
+
+    def __init__(self, num_classes, num_features):
+        super(PytorchModel, self).__init__()
+
+        self.fc1 = nn.Sequential(
+            nn.Linear(num_features, 1024),
+            nn.Tanh(), )
+
+        self.fc2 = nn.Sequential(
+            nn.Linear(1024, 512),
+            nn.Tanh(), )
+
+        self.fc3 = nn.Sequential(
+            nn.Linear(512, 256),
+            nn.Tanh(), )
+
+        self.fc4 = nn.Sequential(
+            nn.Linear(256, 128),
+            nn.Tanh(),
+        )
+
+        self.classifier = nn.Linear(128, num_classes)
+
+    def forward(self, x):
+        out = self.fc1(x)
+        out = self.fc2(out)
+        out = self.fc3(out)
+        out = self.fc4(out)
+        return self.classifier(out)


 def test_minimizer_pytorch(data_three_features):
@ -944,49 +990,17 @@ def test_minimizer_pytorch(data_three_features):
    x = x.astype(np.float32)
    qi = ['age', 'weight']

-    from torch import nn, optim
    from apt.utils.datasets.datasets import PytorchData
    from apt.utils.models.pytorch_model import PyTorchClassifier

-    class pytorch_model(nn.Module):
-
-        def __init__(self, num_classes, num_features):
-            super(pytorch_model, self).__init__()
-
-            self.fc1 = nn.Sequential(
-                nn.Linear(num_features, 1024),
-                nn.Tanh(), )
-
-            self.fc2 = nn.Sequential(
-                nn.Linear(1024, 512),
-                nn.Tanh(), )
-
-            self.fc3 = nn.Sequential(
-                nn.Linear(512, 256),
-                nn.Tanh(), )
-
-            self.fc4 = nn.Sequential(
-                nn.Linear(256, 128),
-                nn.Tanh(),
-            )
-
-            self.classifier = nn.Linear(128, num_classes)
-
-        def forward(self, x):
-            out = self.fc1(x)
-            out = self.fc2(out)
-            out = self.fc3(out)
-            out = self.fc4(out)
-            return self.classifier(out)
-
-    base_est = pytorch_model(2, 3)
+    base_est = PytorchModel(2, 3)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(base_est.parameters(), lr=0.01)

    model = PyTorchClassifier(model=base_est, output_type=ModelOutputType.CLASSIFIER_LOGITS, loss=criterion,
                              optimizer=optimizer, input_shape=(3,),
                              nb_classes=2)
-    model.fit(PytorchData(x.astype(np.float32), y), save_entire_model=False, nb_epochs=10)
+    model.fit(PytorchData(x, y), save_entire_model=False, nb_epochs=10)

    ad = ArrayDataset(x)
    predictions = model.predict(ad)
@ -1006,7 +1020,41 @@ def test_minimizer_pytorch(data_three_features):
    check_ncp(ncp, expected_generalizations)

    rel_accuracy = model.score(ArrayDataset(transformed.astype(np.float32), predictions))
-    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
+
+
+def test_minimizer_pytorch_iris():
+    features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
+    (x_train, y_train), _ = get_iris_dataset_np()
+    x_train = x_train.astype(np.float32)
+    qi = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
+
+    from apt.utils.datasets.datasets import PytorchData
+    from apt.utils.models.pytorch_model import PyTorchClassifier
+
+    base_est = PytorchModel(3, 4)
+    criterion = nn.CrossEntropyLoss()
+    optimizer = optim.Adam(base_est.parameters(), lr=0.01)
+
+    model = PyTorchClassifier(model=base_est, output_type=ModelOutputType.CLASSIFIER_LOGITS, loss=criterion,
+                              optimizer=optimizer, input_shape=(4,),
+                              nb_classes=3)
+    model.fit(PytorchData(x_train, y_train), save_entire_model=False, nb_epochs=10)
+
+    predictions = model.predict(ArrayDataset(x_train))
+    if predictions.shape[1] > 1:
+        predictions = np.argmax(predictions, axis=1)
+    target_accuracy = 0.99
+    gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=qi)
+    transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features))
+    gener = gen.generalizations
+
+    check_features(features, gener, transformed, x_train)
+    ncp = gen.ncp.transform_score
+    check_ncp(ncp, gener)
+
+    rel_accuracy = model.score(ArrayDataset(transformed.astype(np.float32), predictions))
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)


 def test_untouched():