fix docstring and fix assert in test

This commit is contained in:
olasaadi 2022-03-22 13:59:28 +02:00
parent 5b3476071f
commit 312469212e
2 changed files with 10 additions and 18 deletions

View file

@ -22,7 +22,7 @@ class Anonymize:
"""
:param k: The privacy parameter that determines the number of records that will be indistinguishable from each
other (when looking at the quasi identifiers). Should be at least 2.
:param quasi_identifiers: The indexes of features that need to be minimized in case of pandas data.
:param quasi_identifiers: The indexes of features that need to be minimized.
:param categorical_features: The list of categorical features indexes
:param is_regression: Boolean param indicating that this is a regression problem.
"""
@ -35,26 +35,22 @@ class Anonymize:
self.quasi_identifiers = quasi_identifiers
self.categorical_features = categorical_features
self.is_regression = is_regression
self.features = None
def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
"""
Method for performing model-guided anonymization.
:param dataset: Data wrapper containing the training data for the model and the predictions of the
original model on the training data. If implemented with a pandas dataframe, may
contain both numeric and categorical data.
original model on the training data.
:return: An array containing the anonymized training dataset.
"""
self.features = dataset.features_names
if self.features is not None:
self._features = self.features
if dataset.features_names is not None:
self._features = dataset.features_names
# if features is None, use numbers instead of names
elif dataset.get_samples().shape[0] != 0:
self._features = [i for i in range(dataset.get_samples().shape[0])]
else:
self._features = None
assert False
raise ValueError('No data provided')
transformed = self._anonymize_ndarray(dataset.get_samples().copy(), dataset.get_labels())
if dataset.is_pandas:
@ -68,7 +64,7 @@ class Anonymize:
x_anonymizer_train = x[:, self.quasi_identifiers]
if x.dtype.kind not in 'iufc':
if not self.categorical_features:
raise ValueError('When supplying a pandas dataframe, categorical_features must be defined')
raise ValueError('when supplying an array with non-numeric data, categorical_features must be defined')
x_prepared = self._modify_categorical_features(x_anonymizer_train)
else:
x_prepared = x_anonymizer_train

View file

@ -29,7 +29,6 @@ def test_anonymize_ndarray_iris():
def test_anonymize_pandas_adult():
(x_train, y_train), _ = get_adult_dataset()
print(type(x_train['hours-per-week'][0]))
encoded = OneHotEncoder().fit_transform(x_train)
model = DecisionTreeClassifier()
model.fit(encoded, y_train)
@ -49,10 +48,7 @@ def test_anonymize_pandas_adult():
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
assert (anon.loc[:, QI].value_counts().min() >= k)
#assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
print(type(x_train['hours-per-week'][0]))
np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
def test_anonymize_pandas_nursery():
(x_train, y_train), _ = get_nursery_dataset()
@ -73,7 +69,7 @@ def test_anonymize_pandas_nursery():
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
assert (anon.loc[:, QI].value_counts().min() >= k)
# assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
def test_regression():
@ -107,7 +103,7 @@ def test_errors():
anonymizer = Anonymize(10, [0, 2])
(x_train, y_train), (x_test, y_test) = get_iris_dataset()
with pytest.raises(ValueError):
anonymizer.anonymize(x_train, y_test)
anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))
(x_train, y_train), _ = get_adult_dataset()
with pytest.raises(ValueError):
anonymizer.anonymize(x_train, y_train)
anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))