mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-04-24 20:36:21 +02:00
categorical features and QI passed by indexes
dataset include feature names and is_pandas param
This commit is contained in:
parent
3263f92bee
commit
8aa7bb8281
3 changed files with 26 additions and 27 deletions
|
|
@ -17,15 +17,13 @@ class Anonymize:
|
|||
Based on the implementation described in: https://arxiv.org/abs/2007.13086
|
||||
"""
|
||||
|
||||
def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], features = None, categorical_features: Optional[list] = None,
|
||||
def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
|
||||
is_regression=False):
|
||||
"""
|
||||
:param k: The privacy parameter that determines the number of records that will be indistinguishable from each
|
||||
other (when looking at the quasi identifiers). Should be at least 2.
|
||||
:param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
|
||||
in case of numpy data.
|
||||
:param categorical_features: The list of categorical features (should only be supplied when passing data as a
|
||||
pandas dataframe.
|
||||
:param quasi_identifiers: The indexes of features that need to be minimized in case of pandas data.
|
||||
:param categorical_features: The list of categorical features indexes
|
||||
:param is_regression: Boolean param indicates that is is a regression problem.
|
||||
"""
|
||||
if k < 2:
|
||||
|
|
@ -37,7 +35,7 @@ class Anonymize:
|
|||
self.quasi_identifiers = quasi_identifiers
|
||||
self.categorical_features = categorical_features
|
||||
self.is_regression = is_regression
|
||||
self.features = features
|
||||
self.features = None
|
||||
|
||||
def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
|
||||
"""
|
||||
|
|
@ -48,24 +46,21 @@ class Anonymize:
|
|||
contain both numeric and categorical data.
|
||||
:return: An array containing the anonymized training dataset.
|
||||
"""
|
||||
|
||||
if self.features:
|
||||
self.features = dataset.features_names
|
||||
if self.features is not None:
|
||||
self._features = self.features
|
||||
# if features is None, use numbers instead of names
|
||||
elif dataset.get_samples().shape[0] != 0:
|
||||
self._features = [i for i in range(dataset.get_samples().shape[0])]
|
||||
else:
|
||||
self._features = None
|
||||
if self.quasi_identifiers and self.features:
|
||||
self.quasi_identifiers = [i for i,v in enumerate(self.features) if v in self.quasi_identifiers]
|
||||
if self.categorical_features and self.features:
|
||||
self.categorical_features = [i for i,v in enumerate(self.features) if v in self.categorical_features]
|
||||
assert False
|
||||
|
||||
transformed = self._anonymize_ndarray(dataset.get_samples().copy(), dataset.get_labels())
|
||||
if dataset.is_numpy:
|
||||
return transformed
|
||||
else:
|
||||
if dataset.is_pandas:
|
||||
return pd.DataFrame(transformed, columns=self._features)
|
||||
else:
|
||||
return transformed
|
||||
|
||||
def _anonymize_ndarray(self, x, y):
|
||||
if x.shape[0] != y.shape[0]:
|
||||
|
|
@ -111,10 +106,7 @@ class Anonymize:
|
|||
# get all rows in cell
|
||||
indexes = [index for index, node_id in enumerate(node_ids) if node_id == cell['id']]
|
||||
# TODO: should we filter only those with majority label? (using hist)
|
||||
if type(x) == np.ndarray:
|
||||
rows = x[indexes]
|
||||
else: # pandas
|
||||
rows = x.iloc[indexes]
|
||||
rows = x[indexes]
|
||||
for feature in self.quasi_identifiers:
|
||||
if type(x) == np.ndarray:
|
||||
values = rows[:, feature]
|
||||
|
|
|
|||
|
|
@ -29,10 +29,9 @@ def array2numpy(self, arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
|
|||
converts from INPUT_DATA_ARRAY_TYPE to numpy array
|
||||
"""
|
||||
if type(arr) == np.ndarray:
|
||||
self.is_numpy = True
|
||||
return arr
|
||||
if type(arr) == pd.DataFrame or type(arr) == pd.Series:
|
||||
self.is_numpy = False
|
||||
self.is_pandas = True
|
||||
return arr.to_numpy()
|
||||
if isinstance(arr, list):
|
||||
return np.array(arr)
|
||||
|
|
@ -171,9 +170,12 @@ class ArrayDataset(Dataset):
|
|||
:param y: collection of labels (optional)
|
||||
:param kwargs: dataset parameters
|
||||
"""
|
||||
self.is_numpy = True
|
||||
self.is_pandas = False
|
||||
self.features_names = None
|
||||
self._y = array2numpy(self, y) if y is not None else None
|
||||
self._x = array2numpy(self, x)
|
||||
if self.is_pandas:
|
||||
self.features_names = x.columns
|
||||
|
||||
if y is not None and len(self._x) != len(self._y):
|
||||
raise ValueError('Non equivalent lengths of x and y')
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ def test_anonymize_ndarray_iris():
|
|||
|
||||
def test_anonymize_pandas_adult():
|
||||
(x_train, y_train), _ = get_adult_dataset()
|
||||
print(type(x_train['hours-per-week'][0]))
|
||||
encoded = OneHotEncoder().fit_transform(x_train)
|
||||
model = DecisionTreeClassifier()
|
||||
model.fit(encoded, y_train)
|
||||
|
|
@ -41,13 +42,15 @@ def test_anonymize_pandas_adult():
|
|||
'native-country']
|
||||
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
|
||||
'native-country']
|
||||
anonymizer = Anonymize(k, QI, categorical_features=categorical_features, features=features)
|
||||
QI_indexes = [i for i, v in enumerate(features) if v in QI]
|
||||
categorical_features_indexes = [i for i, v in enumerate(features) if v in categorical_features]
|
||||
anonymizer = Anonymize(k, QI_indexes, categorical_features=categorical_features_indexes)
|
||||
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
|
||||
|
||||
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
|
||||
assert (anon.loc[:, QI].value_counts().min() >= k)
|
||||
assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
|
||||
# print(type(x_train['hours-per-week'][0]))
|
||||
#assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
|
||||
print(type(x_train['hours-per-week'][0]))
|
||||
|
||||
|
||||
|
||||
|
|
@ -63,12 +66,14 @@ def test_anonymize_pandas_nursery():
|
|||
k = 100
|
||||
QI = ["finance", "social", "health"]
|
||||
categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health", 'children']
|
||||
anonymizer = Anonymize(k, QI, categorical_features=categorical_features, features=features)
|
||||
QI_indexes = [i for i, v in enumerate(features) if v in QI]
|
||||
categorical_features_indexes = [i for i, v in enumerate(features) if v in categorical_features]
|
||||
anonymizer = Anonymize(k, QI_indexes, categorical_features=categorical_features_indexes)
|
||||
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
|
||||
|
||||
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
|
||||
assert (anon.loc[:, QI].value_counts().min() >= k)
|
||||
assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
|
||||
# assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
|
||||
|
||||
|
||||
def test_regression():
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue