Initial version with first working test

Signed-off-by: abigailt <abigailt@il.ibm.com>
2026-06-23 15:48:06 +02:00 · 2023-11-15 08:21:40 -05:00 · 2023-11-15 08:21:40 -05:00 · 2a657388af
commit 2a657388af
parent 5dce961092
2 changed files with 150 additions and 11 deletions
--- a/tests/test_minimizer.py
+++ b/tests/test_minimizer.py
@ -200,9 +200,9 @@ def check_features(features, expected_generalizations, transformed, x, pandas=Fa
            if features[i] in modified_features:
                indexes.append(i)
        if len(indexes) != transformed.shape[1]:
-            assert ((np.delete(transformed, indexes, axis=1) == np.delete(x, indexes, axis=1)).all())
+            assert (np.array_equal(np.delete(transformed, indexes, axis=1), np.delete(x, indexes, axis=1)))
        if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
-            assert (((transformed[indexes]) != (x[indexes])).any())
+            assert (not np.array_equal(transformed[:, indexes], x[:, indexes]))


 def check_ncp(ncp, expected_generalizations):
@ -920,6 +920,82 @@ def test_BaseEstimator_regression(diabetes_dataset):
    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)


+def test_minimizer_ndarray_one_hot():  # minimizer on a plain ndarray with a declared feature slice
+    x_train = np.array([[23, 0, 1, 165],  # columns 1 and 2 are complementary 0/1 in every row
+                        [45, 0, 1, 158],
+                        [56, 1, 0, 123],
+                        [67, 0, 1, 154],
+                        [45, 1, 0, 149],
+                        [42, 1, 0, 166],
+                        [73, 0, 1, 172],
+                        [94, 0, 1, 168],
+                        [69, 0, 1, 175],
+                        [24, 1, 0, 181],
+                        [18, 1, 0, 190]])
+    y_train = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+
+    model = DecisionTreeClassifier()
+    model.fit(x_train, y_train)
+    predictions = model.predict(x_train)  # the model's own predictions are the labels used to fit gen
+
+    features = ['0', '1', '2', '3']  # string names for the four columns, used by check_features
+    QI = [0, 1, 2]  # column indexes passed as features_to_minimize
+    QI_slices = [[1, 2]]  # columns 1 and 2 passed together as a single feature slice
+    target_accuracy = 0.7
+    gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, feature_slices=QI_slices,
+                                     features_to_minimize=QI)
+    gen.fit(dataset=ArrayDataset(x_train, predictions))
+    transformed = gen.transform(dataset=ArrayDataset(x_train))  # transform is given features only
+    gener = gen.generalizations
+    expected_generalizations = {'categories': {}, 'category_representatives': {},
+                                'range_representatives': {'0': [34.5]},
+                                'ranges': {'0': [34.5]}, 'untouched': ['3', '1', '2']}  # only '0' changes
+
+    compare_generalizations(gener, expected_generalizations)
+
+    check_features(features, expected_generalizations, transformed, x_train)
+    ncp = gen.ncp.transform_score
+    check_ncp(ncp, expected_generalizations)
+
+    rel_accuracy = model.score(transformed, predictions)  # may miss the target by up to ACCURACY_DIFF
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
+
+
+def test_anonymize_pandas_one_hot():  # anonymizer on a DataFrame with a declared gender_M/gender_F slice
+    feature_names = ["age", "gender_M", "gender_F", "height"]
+    x_train = np.array([[23, 0, 1, 165],  # gender_M and gender_F are complementary 0/1 in every row
+                        [45, 0, 1, 158],
+                        [56, 1, 0, 123],
+                        [67, 0, 1, 154],
+                        [45, 1, 0, 149],
+                        [42, 1, 0, 166],
+                        [73, 0, 1, 172],
+                        [94, 0, 1, 168],
+                        [69, 0, 1, 175],
+                        [24, 1, 0, 181],
+                        [18, 1, 0, 190]])
+    y_train = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+    x_train = pd.DataFrame(x_train, columns=feature_names)  # columns are addressed by name from here on
+    y_train = pd.Series(y_train)
+
+    model = DecisionTreeClassifier()
+    model.fit(x_train, y_train)
+    pred = model.predict(x_train)  # the model's own predictions are the labels given to the anonymizer
+
+    k = 10  # minimum count required below for every QI combination in the output
+    QI = ["age", "gender_M", "gender_F"]  # "height" is the only non-QI column
+    QI_slices = [["gender_M", "gender_F"]]  # NOTE(review): kwarg below is spelled 'quasi_identifer_slices' - confirm
+    anonymizer = Anonymize(k, QI, train_only_QI=True, quasi_identifer_slices=QI_slices)
+    anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
+    assert (anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
+    assert (anon.loc[:, QI].value_counts().min() >= k)  # every QI combination occurs at least k times
+    np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))  # non-QI data unchanged
+    anonymized_slice = anon.loc[:, QI_slices[0]]
+    assert ((np.sum(anonymized_slice, axis=1) == 1).all())  # each row of the slice sums to 1
+    assert ((np.max(anonymized_slice, axis=1) == 1).all())  # largest value in each row is 1
+    assert ((np.min(anonymized_slice, axis=1) == 0).all())  # smallest value in each row is 0
+
+
 def test_keras_model():
    (x, y), (x_test, y_test) = get_iris_dataset_np()