mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-06-08 15:05:13 +02:00
Anonymizer works on numpy internally and returns numpy/pandas, matching the type of the original dataset
This commit is contained in:
parent
7b788b9018
commit
3263f92bee
3 changed files with 44 additions and 45 deletions
|
|
@ -5,7 +5,7 @@ from collections import Counter
|
|||
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from apt.utils.datasets import ArrayDataset, DATA_ARRAY_TYPE
|
||||
from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
|
||||
|
||||
from typing import Union, Optional
|
||||
|
||||
|
|
@ -17,7 +17,7 @@ class Anonymize:
|
|||
Based on the implementation described in: https://arxiv.org/abs/2007.13086
|
||||
"""
|
||||
|
||||
def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
|
||||
def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], features = None, categorical_features: Optional[list] = None,
|
||||
is_regression=False):
|
||||
"""
|
||||
:param k: The privacy parameter that determines the number of records that will be indistinguishable from each
|
||||
|
|
@ -37,8 +37,9 @@ class Anonymize:
|
|||
self.quasi_identifiers = quasi_identifiers
|
||||
self.categorical_features = categorical_features
|
||||
self.is_regression = is_regression
|
||||
self.features = features
|
||||
|
||||
def anonymize(self, dataset: ArrayDataset) -> DATA_ARRAY_TYPE:
|
||||
def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
|
||||
"""
|
||||
Method for performing model-guided anonymization.
|
||||
|
||||
|
|
@ -47,18 +48,32 @@ class Anonymize:
|
|||
contain both numeric and categorical data.
|
||||
:return: An array containing the anonymized training dataset.
|
||||
"""
|
||||
if type(dataset.get_samples()) == np.ndarray:
|
||||
return self._anonymize_ndarray(dataset.get_samples().copy(), dataset.get_labels())
|
||||
else: # pandas
|
||||
if not self.categorical_features:
|
||||
raise ValueError('When supplying a pandas dataframe, categorical_features must be defined')
|
||||
return self._anonymize_pandas(dataset.get_samples().copy(), dataset.get_labels())
|
||||
|
||||
if self.features:
|
||||
self._features = self.features
|
||||
# if features is None, use numbers instead of names
|
||||
elif dataset.get_samples().shape[0] != 0:
|
||||
self._features = [i for i in range(dataset.get_samples().shape[0])]
|
||||
else:
|
||||
self._features = None
|
||||
if self.quasi_identifiers and self.features:
|
||||
self.quasi_identifiers = [i for i,v in enumerate(self.features) if v in self.quasi_identifiers]
|
||||
if self.categorical_features and self.features:
|
||||
self.categorical_features = [i for i,v in enumerate(self.features) if v in self.categorical_features]
|
||||
|
||||
transformed = self._anonymize_ndarray(dataset.get_samples().copy(), dataset.get_labels())
|
||||
if dataset.is_numpy:
|
||||
return transformed
|
||||
else:
|
||||
return pd.DataFrame(transformed, columns=self._features)
|
||||
|
||||
def _anonymize_ndarray(self, x, y):
|
||||
if x.shape[0] != y.shape[0]:
|
||||
raise ValueError("x and y should have same number of rows")
|
||||
x_anonymizer_train = x[:, self.quasi_identifiers]
|
||||
if x.dtype.kind not in 'iufc':
|
||||
if not self.categorical_features:
|
||||
raise ValueError('When supplying a pandas dataframe, categorical_features must be defined')
|
||||
x_prepared = self._modify_categorical_features(x_anonymizer_train)
|
||||
else:
|
||||
x_prepared = x_anonymizer_train
|
||||
|
|
@ -71,22 +86,6 @@ class Anonymize:
|
|||
cells_by_id = self._calculate_cells(x, x_prepared)
|
||||
return self._anonymize_data_numpy(x, x_prepared, cells_by_id)
|
||||
|
||||
def _anonymize_pandas(self, x, y):
|
||||
if x.shape[0] != y.shape[0]:
|
||||
raise ValueError("x and y should have same number of rows")
|
||||
x_anonymizer_train = x.loc[:, self.quasi_identifiers]
|
||||
# need to one-hot encode before training the decision tree
|
||||
x_prepared = self._modify_categorical_features(x_anonymizer_train)
|
||||
if self.is_regression:
|
||||
self.anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
|
||||
else:
|
||||
self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
|
||||
if len(y.shape) > 1:
|
||||
y = np.argmax(y, axis=1)
|
||||
self.anonymizer.fit(x_prepared, y)
|
||||
cells_by_id = self._calculate_cells(x, x_prepared)
|
||||
return self._anonymize_data_pandas(x, x_prepared, cells_by_id)
|
||||
|
||||
def _calculate_cells(self, x, x_anonymizer_train):
|
||||
# x is original data, x_anonymizer_train is only QIs + 1-hot encoded
|
||||
cells_by_id = {}
|
||||
|
|
@ -155,16 +154,6 @@ class Anonymize:
|
|||
row[feature] = cell['representative'][feature]
|
||||
return x
|
||||
|
||||
def _anonymize_data_pandas(self, x, x_anonymizer_train, cells_by_id):
|
||||
cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
|
||||
index = 0
|
||||
for i, row in x.iterrows():
|
||||
cell = cells[index]
|
||||
index += 1
|
||||
for feature in cell['representative']:
|
||||
x.at[i, feature] = cell['representative'][feature]
|
||||
return x
|
||||
|
||||
def _modify_categorical_features(self, x):
|
||||
encoder = OneHotEncoder()
|
||||
one_hot_encoded = encoder.fit_transform(x)
|
||||
|
|
|
|||
|
|
@ -24,13 +24,15 @@ OUTPUT_DATA_ARRAY_TYPE = np.ndarray
|
|||
DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame]
|
||||
|
||||
|
||||
def array2numpy(arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
|
||||
def array2numpy(self, arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
|
||||
"""
|
||||
converts from INPUT_DATA_ARRAY_TYPE to numpy array
|
||||
"""
|
||||
if type(arr) == np.ndarray:
|
||||
self.is_numpy = True
|
||||
return arr
|
||||
if type(arr) == pd.DataFrame:
|
||||
if type(arr) == pd.DataFrame or type(arr) == pd.Series:
|
||||
self.is_numpy = False
|
||||
return arr.to_numpy()
|
||||
if isinstance(arr, list):
|
||||
return np.array(arr)
|
||||
|
|
@ -169,8 +171,9 @@ class ArrayDataset(Dataset):
|
|||
:param y: collection of labels (optional)
|
||||
:param kwargs: dataset parameters
|
||||
"""
|
||||
self._x = array2numpy(x)
|
||||
self._y = array2numpy(y) if y is not None else None
|
||||
self.is_numpy = True
|
||||
self._y = array2numpy(self, y) if y is not None else None
|
||||
self._x = array2numpy(self, x)
|
||||
|
||||
if y is not None and len(self._x) != len(self._y):
|
||||
raise ValueError('Non equivalent lengths of x and y')
|
||||
|
|
|
|||
|
|
@ -7,10 +7,12 @@ from apt.anonymization import Anonymize
|
|||
from apt.utils.dataset_utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset
|
||||
from sklearn.datasets import load_diabetes
|
||||
from sklearn.model_selection import train_test_split
|
||||
from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
|
||||
|
||||
|
||||
def test_anonymize_ndarray_iris():
|
||||
(x_train, y_train), _ = get_iris_dataset()
|
||||
|
||||
model = DecisionTreeClassifier()
|
||||
model.fit(x_train, y_train)
|
||||
pred = model.predict(x_train)
|
||||
|
|
@ -18,7 +20,7 @@ def test_anonymize_ndarray_iris():
|
|||
k = 10
|
||||
QI = [0, 2]
|
||||
anonymizer = Anonymize(k, QI)
|
||||
anon = anonymizer.anonymize(x_train, pred)
|
||||
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
|
||||
assert(len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0)))
|
||||
_, counts_elements = np.unique(anon[:, QI], return_counts=True)
|
||||
assert (np.min(counts_elements) >= k)
|
||||
|
|
@ -33,20 +35,25 @@ def test_anonymize_pandas_adult():
|
|||
pred = model.predict(encoded)
|
||||
|
||||
k = 100
|
||||
features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
|
||||
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
|
||||
QI = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
|
||||
'native-country']
|
||||
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
|
||||
'native-country']
|
||||
anonymizer = Anonymize(k, QI, categorical_features=categorical_features)
|
||||
anon = anonymizer.anonymize(x_train, pred)
|
||||
anonymizer = Anonymize(k, QI, categorical_features=categorical_features, features=features)
|
||||
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
|
||||
|
||||
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
|
||||
assert (anon.loc[:, QI].value_counts().min() >= k)
|
||||
assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
|
||||
# print(type(x_train['hours-per-week'][0]))
|
||||
|
||||
|
||||
|
||||
def test_anonymize_pandas_nursery():
|
||||
(x_train, y_train), _ = get_nursery_dataset()
|
||||
features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health"]
|
||||
x_train = x_train.astype(str)
|
||||
encoded = OneHotEncoder().fit_transform(x_train)
|
||||
model = DecisionTreeClassifier()
|
||||
|
|
@ -56,8 +63,8 @@ def test_anonymize_pandas_nursery():
|
|||
k = 100
|
||||
QI = ["finance", "social", "health"]
|
||||
categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health", 'children']
|
||||
anonymizer = Anonymize(k, QI, categorical_features=categorical_features)
|
||||
anon = anonymizer.anonymize(x_train, pred)
|
||||
anonymizer = Anonymize(k, QI, categorical_features=categorical_features, features=features)
|
||||
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
|
||||
|
||||
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
|
||||
assert (anon.loc[:, QI].value_counts().min() >= k)
|
||||
|
|
@ -75,7 +82,7 @@ def test_regression():
|
|||
k = 10
|
||||
QI = [0, 2, 5, 8]
|
||||
anonymizer = Anonymize(k, QI, is_regression=True)
|
||||
anon = anonymizer.anonymize(x_train, pred)
|
||||
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
|
||||
print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
|
||||
model.fit(anon, y_train)
|
||||
print('Base model accuracy (R2 score) after anonymization: ', model.score(x_test, y_test))
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue