Support 1-hot encoded features in anonymization + fixes related to encoding in minimization (#86)

* Support 1-hot encoded features in anonymization (#72)
* Fix anonymization adult notebook + new notebook to demonstrate anonymization on 1-hot encoded data

* Minimizer: No default encoder, if none provided data is supplied to the model as is. Fix data type of representative values. Fix and add more tests.

Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
abigailgold 2023-10-19 11:48:15 +03:00 committed by GitHub
parent 26addd192f
commit 5dce961092
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 670 additions and 255 deletions

View file

@ -23,7 +23,11 @@ class Anonymize:
:type k: int
:param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
in case of numpy data.
:type quasi_identifiers: np.ndarray or list
:type quasi_identifiers: np.ndarray or list of strings or integers.
:param quasi_identifer_slices: If some of the quasi-identifiers represent 1-hot encoded features that need to remain
consistent after anonymization, provide a list containing, for each such feature, the list
of column names or indexes that together represent that single feature.
:type quasi_identifer_slices: list of lists of strings or integers.
:param categorical_features: The list of categorical features (if supplied, these features will be one-hot encoded
before using them to train the decision tree model).
:type categorical_features: list, optional
@ -35,8 +39,12 @@ class Anonymize:
:type train_only_QI: boolean, optional
"""
def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
is_regression: Optional[bool] = False, train_only_QI: Optional[bool] = False):
def __init__(self, k: int,
quasi_identifiers: Union[np.ndarray, list],
quasi_identifer_slices: Optional[list] = None,
categorical_features: Optional[list] = None,
is_regression: Optional[bool] = False,
train_only_QI: Optional[bool] = False):
if k < 2:
raise ValueError("k should be a positive integer with a value of 2 or higher")
if quasi_identifiers is None or len(quasi_identifiers) < 1:
@ -49,6 +57,7 @@ class Anonymize:
self.train_only_QI = train_only_QI
self.features_names = None
self.features = None
self.quasi_identifer_slices = quasi_identifer_slices
def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
"""
@ -76,7 +85,14 @@ class Anonymize:
if self.categorical_features and not set(self.categorical_features).issubset(set(self.features_names)):
raise ValueError('Categorical features should bs a subset of the supplied features or indexes in range of '
'the data columns')
# transform quasi identifiers to indexes
self.quasi_identifiers = [i for i, v in enumerate(self.features_names) if v in self.quasi_identifiers]
if self.quasi_identifer_slices:
temp_list = []
for slice in self.quasi_identifer_slices:
new_slice = [i for i, v in enumerate(self.features_names) if v in slice]
temp_list.append(new_slice)
self.quasi_identifer_slices = temp_list
if self.categorical_features:
self.categorical_features = [i for i, v in enumerate(self.features_names) if v in self.categorical_features]
@ -126,31 +142,49 @@ class Anonymize:
return cells_by_id
def _find_representatives(self, x, x_anonymizer_train, cells):
# x is original data, x_anonymizer_train is only QIs + 1-hot encoded
# x is original data (always numpy), x_anonymizer_train is only QIs + 1-hot encoded
node_ids = self._find_sample_nodes(x_anonymizer_train)
if self.quasi_identifer_slices:
all_one_hot_features = set([feature for encoded in self.quasi_identifer_slices for feature in encoded])
else:
all_one_hot_features = set()
for cell in cells:
cell['representative'] = {}
# get all rows in cell
indexes = [index for index, node_id in enumerate(node_ids) if node_id == cell['id']]
# TODO: should we filter only those with majority label? (using hist)
rows = x[indexes]
for feature in self.quasi_identifiers:
values = rows[:, feature]
if self.categorical_features and feature in self.categorical_features:
# find most common value
cell['representative'][feature] = Counter(values).most_common(1)[0][0]
else:
# find the actual value closest to the median (per feature)
median = np.median(values)
min_value = max(values)
min_dist = float("inf")
for value in values:
# euclidean distance between two floating point values
dist = abs(value - median)
if dist < min_dist:
min_dist = dist
min_value = value
cell['representative'][feature] = min_value
done = set()
for feature in self.quasi_identifiers: # self.quasi_identifiers are numerical indexes
if feature not in done:
# deal with 1-hot encoded features
if feature in all_one_hot_features:
# find features that belong together
for encoded in self.quasi_identifer_slices:
if feature in encoded:
values = rows[:, encoded]
unique_rows, counts = np.unique(values, axis=0, return_counts=True)
rep = unique_rows[np.argmax(counts)]
for i, e in enumerate(encoded):
done.add(e)
cell['representative'][e] = rep[i]
else: # rest of features
values = rows[:, feature]
if self.categorical_features and feature in self.categorical_features:
# find most common value
cell['representative'][feature] = Counter(values).most_common(1)[0][0]
else:
# find the actual value closest to the median (per feature)
median = np.median(values)
min_value = max(values)
min_dist = float("inf")
for value in values:
# euclidean distance between two floating point values
dist = abs(value - median)
if dist < min_dist:
min_dist = dist
min_value = value
cell['representative'][feature] = min_value
def _find_sample_nodes(self, samples):
paths = self._anonymizer.decision_path(samples).toarray()

View file

@ -10,9 +10,6 @@ import copy
import sys
from scipy.spatial import distance
from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.utils.validation import check_is_fitted
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
@ -57,7 +54,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
:param categorical_features: The list of categorical features (if supplied, these features will be one-hot
encoded before using them to train the decision tree model).
:param encoder: Optional encoder for encoding data before feeding it into the estimator (e.g., for categorical
features)
features). If not provided, the data will be fed as is directly to the estimator.
:type encoder: sklearn OrdinalEncoder or OneHotEncoder
:type categorical_features: list of strings, optional
:param features_to_minimize: The features to be minimized.
@ -256,7 +253,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# Going to fit
# (currently not dealing with option to fit with only X and y and no estimator)
if self.estimator and dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
dtype = dataset.get_samples().dtype
x = pd.DataFrame(dataset.get_samples(), columns=self._features)
if not self.features_to_minimize:
self.features_to_minimize = self._features
@ -293,21 +289,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# collect feature data (such as min, max)
self._feature_data = self._get_feature_data(x)
# default encoder in case none provided
if self.encoder is None:
numeric_features = [f for f in self._features if f not in self.categorical_features]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
self.encoder = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, self.categorical_features),
]
)
self.encoder.fit(x)
self.cells = []
self._categorical_values = {}
@ -334,14 +315,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
# self._cells currently holds the generalization created from the tree leaves
self._calculate_generalizations(x_test)
if self.generalize_using_transform:
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
else:
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
generalized = self._generalize(x_test, x_prepared_test, nodes)
# check accuracy
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype), y_test))
accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
print('Initial accuracy of model on generalized data, relative to original model predictions '
'(base generalization derived from tree, before improvements): %f' % accuracy)
@ -364,15 +341,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
self._calculate_generalizations(x_test)
if self.generalize_using_transform:
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
self._cells_by_id)
else:
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype),
y_test))
generalized = self._generalize(x_test, x_prepared_test, nodes)
accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
# if accuracy dropped below the target accuracy, roll back to the previous iteration's generalizations
if accuracy < self.target_accuracy:
self.cells = cells_previous_iter
@ -395,14 +365,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
if removed_feature is None:
break
self._calculate_generalizations(x_test)
if self.generalize_using_transform:
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
self._cells_by_id)
else:
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype),
y_test))
generalized = self._generalize(x_test, x_prepared_test, nodes)
accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
# self._cells currently holds the chosen generalization based on target accuracy
@ -893,7 +857,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def _generalize_indexes(self, original_data, cells, all_indexes):
# prepared data include one hot encoded categorical data + QI
representatives = pd.DataFrame(columns=self._features) # empty except for columns
dtypes = original_data.dtypes.to_dict()
new_dtypes = {}
for t in dtypes.keys():
new_dtypes[t] = pd.Series(dtype=dtypes[t].name)
representatives = pd.DataFrame(new_dtypes) # empty except for columns
original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)
# iterate over cells (leaves in decision tree)
@ -925,6 +893,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return original_data_generalized
def _generalize(self, data, data_prepared, nodes):
    """
    Recalculate the generalizations for ``data`` and return the generalized version of it.

    :param data: The original (un-encoded) samples to generalize.
    :param data_prepared: The prepared form of ``data``; only used when generalizing from the tree.
    :param nodes: The tree nodes passed through to ``_generalize_from_tree``.
    :return: The generalized data, produced either from the tree cells (when
             ``self.generalize_using_transform`` is set) or from ``self.generalizations``.
    """
    # always recalculate first, regardless of which generalization path is taken below
    self._calculate_generalizations(data)
    if not self.generalize_using_transform:
        return self._generalize_from_generalizations(data, self.generalizations)
    return self._generalize_from_tree(data, data_prepared, nodes, self.cells, self._cells_by_id)
@staticmethod
def _map_to_ranges_categories(samples, ranges, categories):
all_sample_indexes = []
@ -994,18 +971,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
feature_data[feature],
total)
if feature_ncp > 0:
# divide by accuracy gain
new_cells = copy.deepcopy(self.cells)
cells_by_id = copy.deepcopy(self._cells_by_id)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
cells_by_id)
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
labels)) - current_accuracy
if accuracy_gain < 0:
accuracy_gain = 0
if accuracy_gain != 0:
feature_ncp = feature_ncp / accuracy_gain
feature_ncp = self._normalize_ncp_by_accuracy_gain(original_data, prepared_data, nodes, feature,
feature_ncp, labels, current_accuracy)
if feature_ncp < range_min:
range_min = feature_ncp
@ -1021,19 +988,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
feature_data[feature],
total)
if feature_ncp > 0:
# divide by accuracy loss
new_cells = copy.deepcopy(self.cells)
cells_by_id = copy.deepcopy(self._cells_by_id)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
cells_by_id)
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
labels)) - current_accuracy
feature_ncp = self._normalize_ncp_by_accuracy_gain(original_data, prepared_data, nodes, feature,
feature_ncp, labels, current_accuracy)
if accuracy_gain < 0:
accuracy_gain = 0
if accuracy_gain != 0:
feature_ncp = feature_ncp / accuracy_gain
if feature_ncp < range_min:
range_min = feature_ncp
remove_feature = feature
@ -1063,6 +1020,21 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
feature_ncp += cell_ncp
return feature_ncp
def _normalize_ncp_by_accuracy_gain(self, original_data, prepared_data, nodes, feature, feature_ncp, labels,
                                    current_accuracy):
    """
    Scale a feature's NCP score by the accuracy gained when that feature is removed from the cells.

    The removal is tried on deep copies of ``self.cells`` and ``self._cells_by_id``.

    :param original_data: The original samples, passed to ``_generalize_from_tree``.
    :param prepared_data: The prepared samples, passed to ``_generalize_from_tree``.
    :param nodes: The tree nodes, passed to ``_generalize_from_tree``.
    :param feature: The feature whose removal is being evaluated.
    :param feature_ncp: The NCP score of ``feature`` before normalization.
    :param labels: The labels used to score the estimator on the generalized data.
    :param current_accuracy: The accuracy to compare against.
    :return: ``feature_ncp`` divided by the accuracy gain when the gain is positive, otherwise
             ``feature_ncp`` unchanged.
    """
    trial_cells = copy.deepcopy(self.cells)
    trial_cells_by_id = copy.deepcopy(self._cells_by_id)
    GeneralizeToRepresentative._remove_feature_from_cells(trial_cells, trial_cells_by_id, feature)
    trial_generalized = self._generalize_from_tree(original_data, prepared_data, nodes, trial_cells,
                                                   trial_cells_by_id)
    gain = self._calculate_accuracy(trial_generalized, labels, self.estimator, self.encoder) - current_accuracy
    # a zero or negative gain leaves the score as is (and avoids dividing by zero)
    if gain <= 0:
        return feature_ncp
    return feature_ncp / gain
def _calculate_generalizations(self, samples: Optional[pd.DataFrame] = None):
ranges, range_representatives = self._calculate_ranges(self.cells)
categories, category_representatives = self._calculate_categories(self.cells)
@ -1282,3 +1254,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
for feature in to_remove:
del generalizations['categories'][feature]
@staticmethod
def _calculate_accuracy(generalized, y_test, estimator, encoder):
    """
    Score the estimator on generalized data, encoding the data first if an encoder was supplied.

    :param generalized: The generalized samples.
    :param y_test: The labels to score against.
    :param estimator: The model wrapper; its ``score`` method is called with an ``ArrayDataset``.
    :param encoder: Optional encoder whose ``transform`` is applied to ``generalized``. If None, the data
                    is fed to the estimator as is.
    :return: The value returned by ``estimator.score``.
    """
    # Compare against None explicitly: a truthiness test would silently skip encoding for any encoder
    # object that defines __len__ or __bool__ and happens to evaluate as falsy.
    model_input = generalized if encoder is None else encoder.transform(generalized)
    return estimator.score(ArrayDataset(model_input, y_test))

View file

@ -368,7 +368,7 @@ class PyTorchClassifier(PyTorchModel):
if validation_data is None:
self._art_model.fit(
x=train_data.get_samples(),
y=train_data.get_labels().reshape(-1, 1),
y=train_data.get_labels(),
batch_size=batch_size,
nb_epochs=nb_epochs,
save_checkpoints=save_checkpoints,
@ -379,9 +379,9 @@ class PyTorchClassifier(PyTorchModel):
else:
self._art_model.fit(
x=train_data.get_samples(),
y=train_data.get_labels().reshape(-1, 1),
y=train_data.get_labels(),
x_validation=validation_data.get_samples(),
y_validation=validation_data.get_labels().reshape(-1, 1),
y_validation=validation_data.get_labels(),
batch_size=batch_size,
nb_epochs=nb_epochs,
save_checkpoints=save_checkpoints,