fix docstring and fix assert in test

2026-06-08 15:05:13 +02:00 · 2022-03-22 13:59:28 +02:00 · 2022-03-22 13:59:28 +02:00 · 312469212e
commit 312469212e
parent 5b3476071f
2 changed files with 10 additions and 18 deletions
--- a/apt/anonymization/anonymizer.py
+++ b/apt/anonymization/anonymizer.py
@ -22,7 +22,7 @@ class Anonymize:
        """
        :param k: The privacy parameter that determines the number of records that will be indistinguishable from each
                  other (when looking at the quasi identifiers). Should be at least 2.
-        :param quasi_identifiers: The indexes of features that need to be minimized in case of pandas data.
+        :param quasi_identifiers: The indexes of features that need to be minimized.
        :param categorical_features: The list of categorical features indexes
        :param is_regression: Boolean param indicating that this is a regression problem.
        """
@ -35,26 +35,22 @@ class Anonymize:
        self.quasi_identifiers = quasi_identifiers
        self.categorical_features = categorical_features
        self.is_regression = is_regression
-        self.features = None

    def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
        """
        Method for performing model-guided anonymization.

        :param dataset: Data wrapper containing the training data for the model and the predictions of the
-                        original model on the training data. If implemented with a pandas dataframe, may
-                        contain both numeric and categorical data.
+                        original model on the training data.
        :return: An array containing the anonymized training dataset.
        """
-        self.features = dataset.features_names
-        if self.features is not None:
-            self._features = self.features
+        if dataset.features_names is not None:
+            self._features = dataset.features_names
            # if features is None, use numbers instead of names
        elif dataset.get_samples().shape[0] != 0:
            self._features = [i for i in range(dataset.get_samples().shape[0])]
        else:
-            self._features = None
-            assert False
+            raise ValueError('No data provided')

        transformed = self._anonymize_ndarray(dataset.get_samples().copy(), dataset.get_labels())
        if dataset.is_pandas:
@ -68,7 +64,7 @@ class Anonymize:
        x_anonymizer_train = x[:, self.quasi_identifiers]
        if x.dtype.kind not in 'iufc':
            if not self.categorical_features:
-                raise ValueError('When supplying a pandas dataframe, categorical_features must be defined')
+                raise ValueError('when supplying an array with non-numeric data, categorical_features must be defined')
            x_prepared = self._modify_categorical_features(x_anonymizer_train)
        else:
            x_prepared = x_anonymizer_train
--- a/tests/test_anonymizer.py
+++ b/tests/test_anonymizer.py
@ -29,7 +29,6 @@ def test_anonymize_ndarray_iris():

 def test_anonymize_pandas_adult():
    (x_train, y_train), _ = get_adult_dataset()
-    print(type(x_train['hours-per-week'][0]))
    encoded = OneHotEncoder().fit_transform(x_train)
    model = DecisionTreeClassifier()
    model.fit(encoded, y_train)
@ -49,10 +48,7 @@ def test_anonymize_pandas_adult():

    assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
    assert (anon.loc[:, QI].value_counts().min() >= k)
-    #assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
-    print(type(x_train['hours-per-week'][0]))
-
-
+    np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))

 def test_anonymize_pandas_nursery():
    (x_train, y_train), _ = get_nursery_dataset()
@ -73,7 +69,7 @@ def test_anonymize_pandas_nursery():

    assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
    assert (anon.loc[:, QI].value_counts().min() >= k)
-    # assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
+    np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))


 def test_regression():
@ -107,7 +103,7 @@ def test_errors():
    anonymizer = Anonymize(10, [0, 2])
    (x_train, y_train), (x_test, y_test) = get_iris_dataset()
    with pytest.raises(ValueError):
-        anonymizer.anonymize(x_train, y_test)
+        anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))
    (x_train, y_train), _ = get_adult_dataset()
    with pytest.raises(ValueError):
-        anonymizer.anonymize(x_train, y_train)
+        anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))