mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-06-29 15:59:38 +02:00
Support 1-hot encoded features in anonymization + fixes related to encoding in minimization (#86)
* Support 1-hot encoded features in anonymization (#72) * Fix anonymization adult notebook + new notebook to demonstrate anonymization on 1-hot encoded data * Minimizer: No default encoder, if none provided data is supplied to the model as is. Fix data type of representative values. Fix and add more tests. Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
parent
26addd192f
commit
5dce961092
7 changed files with 670 additions and 255 deletions
|
|
@ -23,7 +23,11 @@ class Anonymize:
|
|||
:type k: int
|
||||
:param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
|
||||
in case of numpy data.
|
||||
:type quasi_identifiers: np.ndarray or list
|
||||
:type quasi_identifiers: np.ndarray or list of strings or integers.
|
||||
:param quasi_identifer_slices: If some of the quasi-identifiers represent 1-hot encoded features that need to remain
|
||||
consistent after anonymization, provide a list containing the list of column names
|
||||
or indexes that represent a single feature.
|
||||
:type quasi_identifer_slices: list of lists of strings or integers.
|
||||
:param categorical_features: The list of categorical features (if supplied, these features will be one-hot encoded
|
||||
before using them to train the decision tree model).
|
||||
:type categorical_features: list, optional
|
||||
|
|
@ -35,8 +39,12 @@ class Anonymize:
|
|||
:type train_only_QI: boolean, optional
|
||||
"""
|
||||
|
||||
def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
|
||||
is_regression: Optional[bool] = False, train_only_QI: Optional[bool] = False):
|
||||
def __init__(self, k: int,
|
||||
quasi_identifiers: Union[np.ndarray, list],
|
||||
quasi_identifer_slices: Optional[list] = None,
|
||||
categorical_features: Optional[list] = None,
|
||||
is_regression: Optional[bool] = False,
|
||||
train_only_QI: Optional[bool] = False):
|
||||
if k < 2:
|
||||
raise ValueError("k should be a positive integer with a value of 2 or higher")
|
||||
if quasi_identifiers is None or len(quasi_identifiers) < 1:
|
||||
|
|
@ -49,6 +57,7 @@ class Anonymize:
|
|||
self.train_only_QI = train_only_QI
|
||||
self.features_names = None
|
||||
self.features = None
|
||||
self.quasi_identifer_slices = quasi_identifer_slices
|
||||
|
||||
def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
|
||||
"""
|
||||
|
|
@ -76,7 +85,14 @@ class Anonymize:
|
|||
if self.categorical_features and not set(self.categorical_features).issubset(set(self.features_names)):
|
||||
raise ValueError('Categorical features should bs a subset of the supplied features or indexes in range of '
|
||||
'the data columns')
|
||||
# transform quasi identifiers to indexes
|
||||
self.quasi_identifiers = [i for i, v in enumerate(self.features_names) if v in self.quasi_identifiers]
|
||||
if self.quasi_identifer_slices:
|
||||
temp_list = []
|
||||
for slice in self.quasi_identifer_slices:
|
||||
new_slice = [i for i, v in enumerate(self.features_names) if v in slice]
|
||||
temp_list.append(new_slice)
|
||||
self.quasi_identifer_slices = temp_list
|
||||
if self.categorical_features:
|
||||
self.categorical_features = [i for i, v in enumerate(self.features_names) if v in self.categorical_features]
|
||||
|
||||
|
|
@ -126,31 +142,49 @@ class Anonymize:
|
|||
return cells_by_id
|
||||
|
||||
def _find_representatives(self, x, x_anonymizer_train, cells):
    """
    Compute a representative value per quasi-identifier for every cell and store it on the cell.

    Each cell dict is updated in place: ``cell['representative']`` is reset to a new dict keyed by feature
    index. Nothing is returned.

    - Features listed in ``self.quasi_identifer_slices`` (1-hot encoded groups) are handled per group: the
      most frequent complete row over the group's columns is chosen, so the columns of one encoded feature
      stay consistent with each other.
    - Features listed in ``self.categorical_features`` get the most common value in the cell.
    - All other features get the actual value in the cell that is closest to the median.

    :param x: The original data. It is indexed as ``x[row_indexes]`` and ``rows[:, feature]``, i.e. it is
              expected to be a 2-D numpy array.
    :param x_anonymizer_train: The data handed to ``self._find_sample_nodes`` to obtain one node id per
                               sample (only QIs + 1-hot encoded).
    :param cells: Iterable of cell dicts, each having an ``'id'`` key that is matched against the node ids.
    """
    node_ids = self._find_sample_nodes(x_anonymizer_train)
    slices = self.quasi_identifer_slices or []
    all_one_hot_features = {feature for encoded in slices for feature in encoded}

    # Group row positions by node id once, instead of rescanning all node ids for every cell.
    # NOTE(review): assumes node ids are hashable scalars (e.g. ints) - confirm against _find_sample_nodes.
    rows_by_node = {}
    for index, node_id in enumerate(node_ids):
        rows_by_node.setdefault(node_id, []).append(index)

    for cell in cells:
        cell['representative'] = {}
        # get all rows in cell
        # TODO: should we filter only those with majority label? (using hist)
        rows = x[rows_by_node.get(cell['id'], [])]
        done = set()  # 1-hot columns already assigned through their slice
        for feature in self.quasi_identifiers:  # self.quasi_identifiers are numerical indexes
            if feature in done:
                continue

            if feature in all_one_hot_features:
                # deal with 1-hot encoded features: find the columns that belong together
                for encoded in slices:
                    if feature not in encoded:
                        continue
                    values = rows[:, encoded]
                    # pick the most frequent full row so the encoding of the group stays valid
                    # (np.argmax returns the first maximum, i.e. ties follow np.unique's sorted order)
                    unique_rows, counts = np.unique(values, axis=0, return_counts=True)
                    rep = unique_rows[np.argmax(counts)]
                    for i, e in enumerate(encoded):
                        done.add(e)
                        cell['representative'][e] = rep[i]
                continue

            # rest of features
            values = rows[:, feature]
            if self.categorical_features and feature in self.categorical_features:
                # find most common value
                cell['representative'][feature] = Counter(values).most_common(1)[0][0]
            else:
                # find the existing value closest to the median (per feature), so the representative
                # is always a value that actually occurs in the cell
                median = np.median(values)
                closest = max(values)  # fallback if no distance compares below infinity
                min_dist = float("inf")
                for value in values:
                    # absolute difference between the value and the median
                    dist = abs(value - median)
                    if dist < min_dist:
                        min_dist = dist
                        closest = value
                cell['representative'][feature] = closest
|
||||
|
||||
def _find_sample_nodes(self, samples):
|
||||
paths = self._anonymizer.decision_path(samples).toarray()
|
||||
|
|
|
|||
|
|
@ -10,9 +10,6 @@ import copy
|
|||
import sys
|
||||
from scipy.spatial import distance
|
||||
from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
|
||||
from sklearn.utils.validation import check_is_fitted
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
|
|
@ -57,7 +54,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
:param categorical_features: The list of categorical features (if supplied, these features will be one-hot
|
||||
encoded before using them to train the decision tree model).
|
||||
:param encoder: Optional encoder for encoding data before feeding it into the estimator (e.g., for categorical
|
||||
features)
|
||||
features). If not provided, the data will be fed as is directly to the estimator.
|
||||
:type encoder: sklearn OrdinalEncoder or OneHotEncoder
|
||||
:type categorical_features: list of strings, optional
|
||||
:param features_to_minimize: The features to be minimized.
|
||||
|
|
@ -256,7 +253,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
# Going to fit
|
||||
# (currently not dealing with option to fit with only X and y and no estimator)
|
||||
if self.estimator and dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
|
||||
dtype = dataset.get_samples().dtype
|
||||
x = pd.DataFrame(dataset.get_samples(), columns=self._features)
|
||||
if not self.features_to_minimize:
|
||||
self.features_to_minimize = self._features
|
||||
|
|
@ -293,21 +289,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
# collect feature data (such as min, max)
|
||||
self._feature_data = self._get_feature_data(x)
|
||||
|
||||
# default encoder in case none provided
|
||||
if self.encoder is None:
|
||||
numeric_features = [f for f in self._features if f not in self.categorical_features]
|
||||
numeric_transformer = Pipeline(
|
||||
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
|
||||
)
|
||||
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
|
||||
self.encoder = ColumnTransformer(
|
||||
transformers=[
|
||||
("num", numeric_transformer, numeric_features),
|
||||
("cat", categorical_transformer, self.categorical_features),
|
||||
]
|
||||
)
|
||||
self.encoder.fit(x)
|
||||
|
||||
self.cells = []
|
||||
self._categorical_values = {}
|
||||
|
||||
|
|
@ -334,14 +315,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
|
||||
|
||||
# self._cells currently holds the generalization created from the tree leaves
|
||||
self._calculate_generalizations(x_test)
|
||||
if self.generalize_using_transform:
|
||||
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
|
||||
else:
|
||||
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
|
||||
generalized = self._generalize(x_test, x_prepared_test, nodes)
|
||||
|
||||
# check accuracy
|
||||
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype), y_test))
|
||||
accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
|
||||
print('Initial accuracy of model on generalized data, relative to original model predictions '
|
||||
'(base generalization derived from tree, before improvements): %f' % accuracy)
|
||||
|
||||
|
|
@ -364,15 +341,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
|
||||
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
|
||||
|
||||
self._calculate_generalizations(x_test)
|
||||
if self.generalize_using_transform:
|
||||
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
|
||||
self._cells_by_id)
|
||||
else:
|
||||
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
|
||||
|
||||
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype),
|
||||
y_test))
|
||||
generalized = self._generalize(x_test, x_prepared_test, nodes)
|
||||
accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
|
||||
# if accuracy dropped below the target threshold, roll back to the previous iteration's generalizations
|
||||
if accuracy < self.target_accuracy:
|
||||
self.cells = cells_previous_iter
|
||||
|
|
@ -395,14 +365,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
if removed_feature is None:
|
||||
break
|
||||
|
||||
self._calculate_generalizations(x_test)
|
||||
if self.generalize_using_transform:
|
||||
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
|
||||
self._cells_by_id)
|
||||
else:
|
||||
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
|
||||
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype),
|
||||
y_test))
|
||||
generalized = self._generalize(x_test, x_prepared_test, nodes)
|
||||
accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
|
||||
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
|
||||
|
||||
# self._cells currently holds the chosen generalization based on target accuracy
|
||||
|
|
@ -893,7 +857,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
|
||||
def _generalize_indexes(self, original_data, cells, all_indexes):
|
||||
# prepared data include one hot encoded categorical data + QI
|
||||
representatives = pd.DataFrame(columns=self._features) # empty except for columns
|
||||
dtypes = original_data.dtypes.to_dict()
|
||||
new_dtypes = {}
|
||||
for t in dtypes.keys():
|
||||
new_dtypes[t] = pd.Series(dtype=dtypes[t].name)
|
||||
representatives = pd.DataFrame(new_dtypes) # empty except for columns
|
||||
original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)
|
||||
|
||||
# iterate over cells (leaves in decision tree)
|
||||
|
|
@ -925,6 +893,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
|
||||
return original_data_generalized
|
||||
|
||||
def _generalize(self, data, data_prepared, nodes):
    """
    Generalize ``data`` according to the current cells.

    ``self._calculate_generalizations`` is always called first. Then, depending on
    ``self.generalize_using_transform``, the data is generalized either from the tree cells or from
    ``self.generalizations``.

    :param data: The data to generalize.
    :param data_prepared: The prepared form of ``data``; only passed on to ``_generalize_from_tree``.
    :param nodes: The tree nodes; only passed on to ``_generalize_from_tree``.
    :return: The generalized data, as returned by the selected helper.
    """
    # Must run before either branch; the second branch reads self.generalizations afterwards
    # (presumably refreshed by this call - verify in _calculate_generalizations).
    self._calculate_generalizations(data)

    if not self.generalize_using_transform:
        return self._generalize_from_generalizations(data, self.generalizations)

    return self._generalize_from_tree(data, data_prepared, nodes, self.cells, self._cells_by_id)
|
||||
|
||||
@staticmethod
|
||||
def _map_to_ranges_categories(samples, ranges, categories):
|
||||
all_sample_indexes = []
|
||||
|
|
@ -994,18 +971,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
feature_data[feature],
|
||||
total)
|
||||
if feature_ncp > 0:
|
||||
# divide by accuracy gain
|
||||
new_cells = copy.deepcopy(self.cells)
|
||||
cells_by_id = copy.deepcopy(self._cells_by_id)
|
||||
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
|
||||
generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
|
||||
cells_by_id)
|
||||
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
|
||||
labels)) - current_accuracy
|
||||
if accuracy_gain < 0:
|
||||
accuracy_gain = 0
|
||||
if accuracy_gain != 0:
|
||||
feature_ncp = feature_ncp / accuracy_gain
|
||||
feature_ncp = self._normalize_ncp_by_accuracy_gain(original_data, prepared_data, nodes, feature,
|
||||
feature_ncp, labels, current_accuracy)
|
||||
|
||||
if feature_ncp < range_min:
|
||||
range_min = feature_ncp
|
||||
|
|
@ -1021,19 +988,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
feature_data[feature],
|
||||
total)
|
||||
if feature_ncp > 0:
|
||||
# divide by accuracy loss
|
||||
new_cells = copy.deepcopy(self.cells)
|
||||
cells_by_id = copy.deepcopy(self._cells_by_id)
|
||||
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
|
||||
generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
|
||||
cells_by_id)
|
||||
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
|
||||
labels)) - current_accuracy
|
||||
feature_ncp = self._normalize_ncp_by_accuracy_gain(original_data, prepared_data, nodes, feature,
|
||||
feature_ncp, labels, current_accuracy)
|
||||
|
||||
if accuracy_gain < 0:
|
||||
accuracy_gain = 0
|
||||
if accuracy_gain != 0:
|
||||
feature_ncp = feature_ncp / accuracy_gain
|
||||
if feature_ncp < range_min:
|
||||
range_min = feature_ncp
|
||||
remove_feature = feature
|
||||
|
|
@ -1063,6 +1020,21 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
feature_ncp += cell_ncp
|
||||
return feature_ncp
|
||||
|
||||
def _normalize_ncp_by_accuracy_gain(self, original_data, prepared_data, nodes, feature, feature_ncp, labels,
                                    current_accuracy):
    """
    Divide a feature's NCP score by the accuracy gained when that feature is removed from the cells.

    The removal is simulated on deep copies of ``self.cells`` and ``self._cells_by_id``, so the instance
    state is not modified here. A negative gain is clamped to zero, and a zero gain leaves ``feature_ncp``
    unchanged (no division takes place).

    :param original_data: The original data, passed on to ``_generalize_from_tree``.
    :param prepared_data: The prepared data, passed on to ``_generalize_from_tree``.
    :param nodes: The tree nodes, passed on to ``_generalize_from_tree``.
    :param feature: The feature whose removal is simulated.
    :param feature_ncp: The NCP score of ``feature`` before normalization.
    :param labels: The labels used to score the generalized data.
    :param current_accuracy: The accuracy to compare against.
    :return: ``feature_ncp`` divided by the accuracy gain, or ``feature_ncp`` as is when the gain is zero
             or negative.
    """
    # Work on copies so the trial removal does not leak into the real cells.
    trial_cells = copy.deepcopy(self.cells)
    trial_cells_by_id = copy.deepcopy(self._cells_by_id)
    GeneralizeToRepresentative._remove_feature_from_cells(trial_cells, trial_cells_by_id, feature)

    trial_generalized = self._generalize_from_tree(original_data, prepared_data, nodes, trial_cells,
                                                   trial_cells_by_id)
    trial_accuracy = self._calculate_accuracy(trial_generalized, labels, self.estimator, self.encoder)

    gain = trial_accuracy - current_accuracy
    if gain < 0:
        gain = 0
    if gain == 0:
        return feature_ncp
    return feature_ncp / gain
|
||||
|
||||
def _calculate_generalizations(self, samples: Optional[pd.DataFrame] = None):
|
||||
ranges, range_representatives = self._calculate_ranges(self.cells)
|
||||
categories, category_representatives = self._calculate_categories(self.cells)
|
||||
|
|
@ -1282,3 +1254,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
|
||||
for feature in to_remove:
|
||||
del generalizations['categories'][feature]
|
||||
|
||||
@staticmethod
def _calculate_accuracy(generalized, y_test, estimator, encoder):
    """
    Score the estimator on generalized data.

    :param generalized: The generalized samples.
    :param y_test: The labels to score against.
    :param estimator: The model to evaluate; its ``score`` method is called with an ``ArrayDataset``.
    :param encoder: Optional encoder applied to ``generalized`` (via ``transform``) before scoring. If
                    ``None``, the data is fed to the estimator as is.
    :return: The value returned by ``estimator.score``.
    """
    # Compare against None explicitly: truthiness of an encoder object is not a reliable "was provided"
    # test (e.g. objects defining __len__, such as sklearn Pipeline, evaluate by length).
    generalized_data = encoder.transform(generalized) if encoder is not None else generalized
    return estimator.score(ArrayDataset(generalized_data, y_test))
|
||||
|
|
|
|||
|
|
@ -368,7 +368,7 @@ class PyTorchClassifier(PyTorchModel):
|
|||
if validation_data is None:
|
||||
self._art_model.fit(
|
||||
x=train_data.get_samples(),
|
||||
y=train_data.get_labels().reshape(-1, 1),
|
||||
y=train_data.get_labels(),
|
||||
batch_size=batch_size,
|
||||
nb_epochs=nb_epochs,
|
||||
save_checkpoints=save_checkpoints,
|
||||
|
|
@ -379,9 +379,9 @@ class PyTorchClassifier(PyTorchModel):
|
|||
else:
|
||||
self._art_model.fit(
|
||||
x=train_data.get_samples(),
|
||||
y=train_data.get_labels().reshape(-1, 1),
|
||||
y=train_data.get_labels(),
|
||||
x_validation=validation_data.get_samples(),
|
||||
y_validation=validation_data.get_labels().reshape(-1, 1),
|
||||
y_validation=validation_data.get_labels(),
|
||||
batch_size=batch_size,
|
||||
nb_epochs=nb_epochs,
|
||||
save_checkpoints=save_checkpoints,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue