mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-04-24 20:36:21 +02:00
fix docstring and fix assert in test
This commit is contained in:
parent
5b3476071f
commit
312469212e
2 changed files with 10 additions and 18 deletions
|
|
@ -22,7 +22,7 @@ class Anonymize:
|
|||
"""
|
||||
:param k: The privacy parameter that determines the number of records that will be indistinguishable from each
|
||||
other (when looking at the quasi identifiers). Should be at least 2.
|
||||
:param quasi_identifiers: The indexes of features that need to be minimized in case of pandas data.
|
||||
:param quasi_identifiers: The indexes of features that need to be minimized.
|
||||
:param categorical_features: The list of indexes of the categorical features.
|
||||
:param is_regression: Boolean param indicating that this is a regression problem.
|
||||
"""
|
||||
|
|
@ -35,26 +35,22 @@ class Anonymize:
|
|||
self.quasi_identifiers = quasi_identifiers
|
||||
self.categorical_features = categorical_features
|
||||
self.is_regression = is_regression
|
||||
self.features = None
|
||||
|
||||
def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
|
||||
"""
|
||||
Method for performing model-guided anonymization.
|
||||
|
||||
:param dataset: Data wrapper containing the training data for the model and the predictions of the
|
||||
original model on the training data. If implemented with a pandas dataframe, may
|
||||
contain both numeric and categorical data.
|
||||
original model on the training data.
|
||||
:return: An array containing the anonymized training dataset.
|
||||
"""
|
||||
self.features = dataset.features_names
|
||||
if self.features is not None:
|
||||
self._features = self.features
|
||||
if dataset.features_names is not None:
|
||||
self._features = dataset.features_names
|
||||
# if features is None, use numbers instead of names
|
||||
elif dataset.get_samples().shape[0] != 0:
|
||||
self._features = [i for i in range(dataset.get_samples().shape[0])]
|
||||
else:
|
||||
self._features = None
|
||||
assert False
|
||||
raise ValueError('No data provided')
|
||||
|
||||
transformed = self._anonymize_ndarray(dataset.get_samples().copy(), dataset.get_labels())
|
||||
if dataset.is_pandas:
|
||||
|
|
@ -68,7 +64,7 @@ class Anonymize:
|
|||
x_anonymizer_train = x[:, self.quasi_identifiers]
|
||||
if x.dtype.kind not in 'iufc':
|
||||
if not self.categorical_features:
|
||||
raise ValueError('When supplying a pandas dataframe, categorical_features must be defined')
|
||||
raise ValueError('when supplying an array with non-numeric data, categorical_features must be defined')
|
||||
x_prepared = self._modify_categorical_features(x_anonymizer_train)
|
||||
else:
|
||||
x_prepared = x_anonymizer_train
|
||||
|
|
|
|||
|
|
@ -29,7 +29,6 @@ def test_anonymize_ndarray_iris():
|
|||
|
||||
def test_anonymize_pandas_adult():
|
||||
(x_train, y_train), _ = get_adult_dataset()
|
||||
print(type(x_train['hours-per-week'][0]))
|
||||
encoded = OneHotEncoder().fit_transform(x_train)
|
||||
model = DecisionTreeClassifier()
|
||||
model.fit(encoded, y_train)
|
||||
|
|
@ -49,10 +48,7 @@ def test_anonymize_pandas_adult():
|
|||
|
||||
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
|
||||
assert (anon.loc[:, QI].value_counts().min() >= k)
|
||||
#assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
|
||||
print(type(x_train['hours-per-week'][0]))
|
||||
|
||||
|
||||
np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
|
||||
|
||||
def test_anonymize_pandas_nursery():
|
||||
(x_train, y_train), _ = get_nursery_dataset()
|
||||
|
|
@ -73,7 +69,7 @@ def test_anonymize_pandas_nursery():
|
|||
|
||||
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
|
||||
assert (anon.loc[:, QI].value_counts().min() >= k)
|
||||
# assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
|
||||
np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
|
||||
|
||||
|
||||
def test_regression():
|
||||
|
|
@ -107,7 +103,7 @@ def test_errors():
|
|||
anonymizer = Anonymize(10, [0, 2])
|
||||
(x_train, y_train), (x_test, y_test) = get_iris_dataset()
|
||||
with pytest.raises(ValueError):
|
||||
anonymizer.anonymize(x_train, y_test)
|
||||
anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))
|
||||
(x_train, y_train), _ = get_adult_dataset()
|
||||
with pytest.raises(ValueError):
|
||||
anonymizer.anonymize(x_train, y_train)
|
||||
anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue