mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-04-24 20:36:21 +02:00
Support for one-hot encoded features in minimization (#87)
* Initial version with first working test * Make sure representative values in generalizations for 1-hot encoded features are consistent. * Updated notebooks for one-hot encoded data * Review comments Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
parent
5dce961092
commit
6d81cd8ed4
4 changed files with 26703 additions and 48 deletions
|
|
@ -56,9 +56,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
:param encoder: Optional encoder for encoding data before feeding it into the estimator (e.g., for categorical
|
||||
features). If not provided, the data will be fed as is directly to the estimator.
|
||||
:type encoder: sklearn OrdinalEncoder or OneHotEncoder
|
||||
:type categorical_features: list of strings, optional
|
||||
:param features_to_minimize: The features to be minimized.
|
||||
:type categorical_features: list of strings or integers, optional
|
||||
:param features_to_minimize: The features to be minimized. If not provided, all features will be minimized.
|
||||
:type features_to_minimize: list of strings or int, optional
|
||||
:param feature_slices: If some of the features to be minimized represent 1-hot encoded features that need to remain
|
||||
consistent after minimization, provide a list of lists, each inner list holding the column names
|
||||
or indexes that represent a single feature.
|
||||
:type feature_slices: list of lists of strings or integers, optional
|
||||
:param train_only_features_to_minimize: Whether to train the tree just on the ``features_to_minimize`` or on all
|
||||
features. Default is only on ``features_to_minimize``.
|
||||
:type train_only_features_to_minimize: boolean, optional
|
||||
|
|
@ -79,6 +83,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
categorical_features: Optional[Union[np.ndarray, list]] = None,
|
||||
encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None,
|
||||
features_to_minimize: Optional[Union[np.ndarray, list]] = None,
|
||||
feature_slices: Optional[list] = None,
|
||||
train_only_features_to_minimize: Optional[bool] = True,
|
||||
is_regression: Optional[bool] = False,
|
||||
generalize_using_transform: bool = True):
|
||||
|
|
@ -91,12 +96,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_PROBABILITIES)
|
||||
self.target_accuracy = target_accuracy
|
||||
self.cells = cells
|
||||
if cells:
|
||||
self._calculate_generalizations()
|
||||
self.categorical_features = []
|
||||
if categorical_features:
|
||||
self.categorical_features = categorical_features
|
||||
self.features_to_minimize = features_to_minimize
|
||||
self.feature_slices = feature_slices
|
||||
if self.feature_slices:
|
||||
self.all_one_hot_features = {str(feature) for encoded in self.feature_slices for feature in encoded}
|
||||
else:
|
||||
self.all_one_hot_features = set()
|
||||
self.train_only_features_to_minimize = train_only_features_to_minimize
|
||||
self.is_regression = is_regression
|
||||
self.encoder = encoder
|
||||
|
|
@ -107,6 +115,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
self._dt = None
|
||||
self._features = None
|
||||
self._level = 0
|
||||
if cells:
|
||||
self._calculate_generalizations()
|
||||
|
||||
def get_params(self, deep=True):
|
||||
"""
|
||||
|
|
@ -121,6 +131,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
ret['target_accuracy'] = self.target_accuracy
|
||||
ret['categorical_features'] = self.categorical_features
|
||||
ret['features_to_minimize'] = self.features_to_minimize
|
||||
ret['feature_slices'] = self.feature_slices
|
||||
ret['train_only_features_to_minimize'] = self.train_only_features_to_minimize
|
||||
ret['is_regression'] = self.is_regression
|
||||
ret['estimator'] = self.estimator
|
||||
|
|
@ -151,6 +162,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
self.categorical_features = params['categorical_features']
|
||||
if 'features_to_minimize' in params:
|
||||
self.features_to_minimize = params['features_to_minimize']
|
||||
if 'feature_slices' in params:
|
||||
self.feature_slices = params['feature_slices']
|
||||
if 'train_only_features_to_minimize' in params:
|
||||
self.train_only_features_to_minimize = params['train_only_features_to_minimize']
|
||||
if 'is_regression' in params:
|
||||
|
|
@ -259,6 +272,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
self.features_to_minimize = [str(i) for i in self.features_to_minimize]
|
||||
if not all(elem in self._features for elem in self.features_to_minimize):
|
||||
raise ValueError('features to minimize should be a subset of features names')
|
||||
if self.feature_slices:
|
||||
temp_list = []
|
||||
for slice in self.feature_slices:
|
||||
new_slice = [str(i) for i in slice]
|
||||
if not all(elem in self._features for elem in new_slice):
|
||||
raise ValueError('features in slices should be a subset of features names')
|
||||
temp_list.append(new_slice)
|
||||
self.feature_slices = temp_list
|
||||
x_qi = x.loc[:, self.features_to_minimize]
|
||||
|
||||
# divide dataset into train and test
|
||||
|
|
@ -325,8 +346,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
# if accuracy above threshold, improve generalization
|
||||
if accuracy > self.target_accuracy:
|
||||
print('Improving generalizations')
|
||||
self._level = 1
|
||||
self._level = 0
|
||||
while accuracy > self.target_accuracy:
|
||||
self._level += 1
|
||||
cells_previous_iter = self.cells
|
||||
generalization_prev_iter = self._generalizations
|
||||
cells_by_id_prev = self._cells_by_id
|
||||
|
|
@ -352,7 +374,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
break
|
||||
else:
|
||||
print('Pruned tree to level: %d, new relative accuracy: %f' % (self._level, accuracy))
|
||||
self._level += 1
|
||||
|
||||
# if accuracy below threshold, improve accuracy by removing features from generalization
|
||||
elif accuracy < self.target_accuracy:
|
||||
|
|
@ -375,6 +396,16 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
x_test_dataset = ArrayDataset(x_test, features_names=self._features)
|
||||
self._ncp_scores.fit_score = self.calculate_ncp(x_test_dataset)
|
||||
self._ncp_scores.generalizations_score = self.calculate_ncp(x_test_dataset)
|
||||
else:
|
||||
print('No fitting was performed as some information was missing')
|
||||
if not self.estimator:
|
||||
print('No estimator provided')
|
||||
elif not dataset:
|
||||
print('No data provided')
|
||||
elif dataset.get_samples() is None:
|
||||
print('No samples provided')
|
||||
elif dataset.get_labels() is None:
|
||||
print('No labels provided')
|
||||
|
||||
# Return the transformer
|
||||
return self
|
||||
|
|
@ -579,7 +610,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
elif feature in cell['untouched']:
|
||||
continue
|
||||
else:
|
||||
raise TypeError("feature " + feature + "not found in cell" + cell['id'])
|
||||
raise TypeError("feature " + str(feature) + " not found in cell " + str(cell['id']))
|
||||
# Mark as mapped
|
||||
mapped.itemset(index, 1)
|
||||
return True
|
||||
|
|
@ -703,6 +734,32 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
# categorical feature can not have this value
|
||||
if categorical_value in new_cell['categories'][categorical_feature]:
|
||||
new_cell['categories'][categorical_feature].remove(categorical_value)
|
||||
# features that were already one-hot encoded. Legal values should be 0 or 1
|
||||
elif feature in self.all_one_hot_features:
|
||||
if feature not in new_cell['categories'].keys():
|
||||
new_cell['categories'][feature] = []
|
||||
if feature in cell['ranges']:
|
||||
range = cell['ranges'][feature]
|
||||
if range['start'] is None and range['end'] < 1:
|
||||
feature_value = 0
|
||||
elif range['end'] is None and range['start'] > 0:
|
||||
feature_value = 1
|
||||
else:
|
||||
raise ValueError('Illegal range for 1-hot encoded feature')
|
||||
new_cell['categories'][feature] = [feature_value]
|
||||
|
||||
# need to add other columns that represent same 1-hot encoded feature
|
||||
|
||||
# search for feature group:
|
||||
other_features, encoded = self._get_other_features_in_encoding(feature, self.feature_slices)
|
||||
for other_feature in other_features:
|
||||
if feature_value == 1:
|
||||
new_cell['categories'][other_feature] = [0]
|
||||
elif len(encoded) == 2:
|
||||
new_cell['categories'][other_feature] = [1]
|
||||
elif (other_feature not in new_cell['categories'].keys()
|
||||
or len(new_cell['categories'][other_feature]) == 0):
|
||||
new_cell['categories'][other_feature] = [0, 1]
|
||||
else:
|
||||
if feature in cell['ranges'].keys():
|
||||
new_cell['ranges'][feature] = cell['ranges'][feature]
|
||||
|
|
@ -813,6 +870,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
min_dist = dist
|
||||
min = i
|
||||
i = i + 1
|
||||
# since this is an actual row from the data, correct one-hot encoding is already guaranteed
|
||||
row = match_rows.iloc[min]
|
||||
for feature in cell['ranges'].keys():
|
||||
cell['representative'][feature] = row[feature]
|
||||
|
|
@ -861,6 +919,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
new_dtypes = {}
|
||||
for t in dtypes.keys():
|
||||
new_dtypes[t] = pd.Series(dtype=dtypes[t].name)
|
||||
dtypes[t] = dtypes[t].name
|
||||
representatives = pd.DataFrame(new_dtypes) # empty except for columns
|
||||
original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)
|
||||
|
||||
|
|
@ -891,6 +950,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
replace = pd.DataFrame(replace, indexes, columns=self._features)
|
||||
original_data_generalized.loc[indexes, representatives.columns.tolist()] = replace
|
||||
|
||||
original_data_generalized = original_data_generalized.astype(dtype=dtypes)
|
||||
return original_data_generalized
|
||||
|
||||
def _generalize(self, data, data_prepared, nodes):
|
||||
|
|
@ -1024,7 +1084,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
current_accuracy):
|
||||
new_cells = copy.deepcopy(self.cells)
|
||||
cells_by_id = copy.deepcopy(self._cells_by_id)
|
||||
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
|
||||
self._remove_feature_from_cells(new_cells, cells_by_id, feature)
|
||||
generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
|
||||
cells_by_id)
|
||||
accuracy = self._calculate_accuracy(generalized, labels, self.estimator, self.encoder)
|
||||
|
|
@ -1050,17 +1110,31 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
# categorical - use most common value
|
||||
old_category_representatives = category_representatives
|
||||
category_representatives = {}
|
||||
done = set()
|
||||
for feature in self._generalizations['categories']:
|
||||
category_representatives[feature] = []
|
||||
for g_index, group in enumerate(self._generalizations['categories'][feature]):
|
||||
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == g_index]
|
||||
if indexes:
|
||||
rows = samples.iloc[indexes]
|
||||
values = rows[feature]
|
||||
category = Counter(values).most_common(1)[0][0]
|
||||
category_representatives[feature].append(category)
|
||||
else:
|
||||
category_representatives[feature].append(old_category_representatives[feature][g_index])
|
||||
if feature not in done:
|
||||
category_representatives[feature] = []
|
||||
for g_index, group in enumerate(self._generalizations['categories'][feature]):
|
||||
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == g_index]
|
||||
if indexes:
|
||||
rows = samples.iloc[indexes]
|
||||
if feature in self.all_one_hot_features:
|
||||
other_features, encoded = self._get_other_features_in_encoding(feature,
|
||||
self.feature_slices)
|
||||
values = rows.loc[:, encoded].to_numpy()
|
||||
unique_rows, counts = np.unique(values, axis=0, return_counts=True)
|
||||
rep = unique_rows[np.argmax(counts)]
|
||||
for i, e in enumerate(encoded):
|
||||
done.add(e)
|
||||
if e not in category_representatives.keys():
|
||||
category_representatives[e] = []
|
||||
category_representatives[e].append(rep[i])
|
||||
else:
|
||||
values = rows[feature]
|
||||
category = Counter(values).most_common(1)[0][0]
|
||||
category_representatives[feature].append(category)
|
||||
else:
|
||||
category_representatives[feature].append(old_category_representatives[feature][g_index])
|
||||
|
||||
# numerical - use actual value closest to mean
|
||||
old_range_representatives = range_representatives
|
||||
|
|
@ -1169,35 +1243,55 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
range_representatives[feature].append(prev_value + 1)
|
||||
return ranges, range_representatives
|
||||
|
||||
@staticmethod
|
||||
def _calculate_categories(cells):
|
||||
def _calculate_categories(self, cells):
|
||||
categories = {}
|
||||
category_representatives = {}
|
||||
categorical_features_values = GeneralizeToRepresentative._calculate_categorical_features_values(cells)
|
||||
assigned_features = set()
|
||||
for feature in categorical_features_values.keys():
|
||||
partitions = []
|
||||
category_representatives[feature] = []
|
||||
values = categorical_features_values[feature]
|
||||
assigned = []
|
||||
assigned_values = set()
|
||||
for i in range(len(values)):
|
||||
value1 = values[i]
|
||||
if value1 in assigned:
|
||||
if value1 in assigned_values:
|
||||
continue
|
||||
partition = [value1]
|
||||
assigned.append(value1)
|
||||
assigned_values.add(value1)
|
||||
for j in range(len(values)):
|
||||
if j <= i:
|
||||
continue
|
||||
value2 = values[j]
|
||||
if GeneralizeToRepresentative._are_inseparable(cells, feature, value1, value2):
|
||||
partition.append(value2)
|
||||
assigned.append(value2)
|
||||
assigned_values.add(value2)
|
||||
partitions.append(partition)
|
||||
# default representative values (computed with no data)
|
||||
category_representatives[feature].append(partition[0]) # random
|
||||
# for 1-hot encoded features, the first encountered feature will get the value 1 and the rest 0
|
||||
if len(partition) > 1 and feature in self.all_one_hot_features:
|
||||
other_features, _ = self._get_other_features_in_encoding(feature, self.feature_slices)
|
||||
assigned = False
|
||||
for other_feature in other_features:
|
||||
if other_feature in assigned_features:
|
||||
category_representatives[feature].append(0)
|
||||
assigned = True
|
||||
break
|
||||
if not assigned:
|
||||
category_representatives[feature].append(1)
|
||||
assigned_features.add(feature)
|
||||
else:
|
||||
category_representatives[feature].append(partition[0]) # random
|
||||
categories[feature] = partitions
|
||||
return categories, category_representatives
|
||||
|
||||
@staticmethod
|
||||
def _get_other_features_in_encoding(feature, feature_slices):
    """
    Find the 1-hot encoded group that ``feature`` belongs to.

    :param feature: The name of a single column.
    :param feature_slices: List of lists, each inner list holding the columns that together encode one
                           feature. ``None`` or an empty list is treated as "no groups defined".
    :return: A tuple ``(other_features, encoded)``. ``encoded`` is the first slice that contains ``feature``
             and ``other_features`` are the remaining columns of that slice, in the order in which the slice
             declares them. ``([], [])`` is returned when no slice contains ``feature``.
    """
    for encoded in feature_slices or []:
        if feature in encoded:
            # dict.fromkeys drops duplicates like the former set difference did, but keeps the declared
            # column order so that results are reproducible between runs
            others = list(dict.fromkeys(other for other in encoded if other != feature))
            return others, encoded
    return [], []
|
||||
|
||||
@staticmethod
|
||||
def _calculate_categorical_features_values(cells):
|
||||
categorical_features_values = {}
|
||||
|
|
@ -1229,16 +1323,25 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
untouched = untouched.intersection(*untouched_lists)
|
||||
return list(untouched)
|
||||
|
||||
def _remove_feature_from_cells(self, cells, cells_by_id, feature):
    """
    Remove a feature from the generalization described by the given cells.

    When ``feature`` is one column of a 1-hot encoded group, every column of each group that contains it
    is removed together, so the encoding stays consistent. Otherwise only ``feature`` itself is removed.

    :param cells: The cells to update; passed on to ``_remove_feature_from_cells_internal``.
    :param cells_by_id: Mapping of cell id to cell; passed on to ``_remove_feature_from_cells_internal``.
    :param feature: The name of the feature to remove.
    """
    groups = []
    if feature in self.all_one_hot_features:
        groups = [encoded for encoded in (self.feature_slices or []) if feature in encoded]
    if not groups:
        # Either a regular feature, or a feature still flagged as 1-hot although no slice lists it
        # (feature_slices can be replaced after construction). Remove it on its own rather than
        # silently leaving it in the generalization.
        groups = [[feature]]
    for group in groups:
        self._remove_feature_from_cells_internal(cells, cells_by_id, group)
|
||||
|
||||
@staticmethod
|
||||
def _remove_feature_from_cells(cells, cells_by_id, feature):
|
||||
def _remove_feature_from_cells_internal(cells, cells_by_id, features):
|
||||
for cell in cells:
|
||||
if 'untouched' not in cell:
|
||||
cell['untouched'] = []
|
||||
if feature in cell['ranges'].keys():
|
||||
del cell['ranges'][feature]
|
||||
elif feature in cell['categories'].keys():
|
||||
del cell['categories'][feature]
|
||||
cell['untouched'].append(feature)
|
||||
for feature in features:
|
||||
if feature in cell['ranges'].keys():
|
||||
del cell['ranges'][feature]
|
||||
elif feature in cell['categories'].keys():
|
||||
del cell['categories'][feature]
|
||||
cell['untouched'].append(feature)
|
||||
cells_by_id[cell['id']] = cell.copy()
|
||||
|
||||
@staticmethod
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In this tutorial we will show how to anonymize models using the ML anonymization module, specifically when the inout data is already one-hot encoded. \n",
|
||||
"In this tutorial we will show how to anonymize models using the ML anonymization module, specifically when the input data is already one-hot encoded. \n",
|
||||
"\n",
|
||||
"This will be demonstrated using the Adult dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/datasets/adult). "
|
||||
]
|
||||
|
|
@ -25,7 +25,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -81,7 +81,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -123,14 +123,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Base model accuracy: 0.814446287083103\n"
|
||||
"Base model accuracy: 0.8143234445058657\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -168,7 +168,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -194,14 +194,14 @@
|
|||
"# QI = (race, sex)\n",
|
||||
"QI = [53, 52, 51, 50, 49, 48, 47]\n",
|
||||
"QI_slices = [[47, 48, 49, 50, 51], [52, 53]]\n",
|
||||
"anonymizer = Anonymize(100, QI)\n",
|
||||
"anonymizer = Anonymize(100, QI, quasi_identifer_slices=QI_slices)\n",
|
||||
"anon = anonymizer.anonymize(ArrayDataset(x_train, x_train_predictions))\n",
|
||||
"print(anon)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -210,7 +210,7 @@
|
|||
"2711"
|
||||
]
|
||||
},
|
||||
"execution_count": 26,
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
|
@ -222,7 +222,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -231,7 +231,7 @@
|
|||
"2476"
|
||||
]
|
||||
},
|
||||
"execution_count": 27,
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
|
@ -250,14 +250,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Anonymized model accuracy: 0.8135863890424421\n"
|
||||
"Anonymized model accuracy: 0.8124808058473066\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
|||
26325
notebooks/minimization_one_hot_adult.ipynb
Normal file
26325
notebooks/minimization_one_hot_adult.ipynb
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -181,8 +181,8 @@ def compare_generalizations(gener, expected_generalizations):
|
|||
== set(gener['range_representatives'][key]))
|
||||
if 'category_representatives' in expected_generalizations:
|
||||
for key in expected_generalizations['category_representatives']:
|
||||
assert (set([frozenset(sl) for sl in expected_generalizations['category_representatives'][key]])
|
||||
== set([frozenset(sl) for sl in gener['category_representatives'][key]]))
|
||||
assert (set(expected_generalizations['category_representatives'][key])
|
||||
== set(gener['category_representatives'][key]))
|
||||
|
||||
|
||||
def check_features(features, expected_generalizations, transformed, x, pandas=False):
|
||||
|
|
@ -200,9 +200,9 @@ def check_features(features, expected_generalizations, transformed, x, pandas=Fa
|
|||
if features[i] in modified_features:
|
||||
indexes.append(i)
|
||||
if len(indexes) != transformed.shape[1]:
|
||||
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x, indexes, axis=1)).all())
|
||||
assert (np.array_equal(np.delete(transformed, indexes, axis=1), np.delete(x, indexes, axis=1)))
|
||||
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
||||
assert (((transformed[indexes]) != (x[indexes])).any())
|
||||
assert (not np.array_equal(transformed[:, indexes], x[:, indexes]))
|
||||
|
||||
|
||||
def check_ncp(ncp, expected_generalizations):
|
||||
|
|
@ -920,6 +920,233 @@ def test_BaseEstimator_regression(diabetes_dataset):
|
|||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
|
||||
|
||||
def test_minimizer_ndarray_one_hot():
    """Minimize ndarray data with one two-column 1-hot group (columns 1 and 2), target accuracy 0.7."""
    samples = np.array([[23, 0, 1, 165],
                        [45, 0, 1, 158],
                        [56, 1, 0, 123],
                        [67, 0, 1, 154],
                        [45, 1, 0, 149],
                        [42, 1, 0, 166],
                        [73, 0, 1, 172],
                        [94, 0, 1, 168],
                        [69, 0, 1, 175],
                        [24, 1, 0, 181],
                        [18, 1, 0, 190]])
    labels = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])

    classifier = DecisionTreeClassifier()
    classifier.fit(samples, labels)
    predicted = classifier.predict(samples)

    feature_names = ['0', '1', '2', '3']
    quasi_identifiers = [0, 1, 2]
    one_hot_groups = [[1, 2]]
    target_accuracy = 0.7
    minimizer = GeneralizeToRepresentative(classifier, target_accuracy=target_accuracy,
                                           feature_slices=one_hot_groups,
                                           features_to_minimize=quasi_identifiers)
    minimizer.fit(dataset=ArrayDataset(samples, predicted))
    transformed = minimizer.transform(dataset=ArrayDataset(samples))
    actual_generalizations = minimizer.generalizations
    expected_generalizations = {'categories': {}, 'category_representatives': {},
                                'range_representatives': {'0': [34.5]},
                                'ranges': {'0': [34.5]}, 'untouched': ['3', '1', '2']}

    compare_generalizations(actual_generalizations, expected_generalizations)

    check_features(feature_names, expected_generalizations, transformed, samples)
    check_ncp(minimizer.ncp.transform_score, expected_generalizations)

    rel_accuracy = classifier.score(transformed, predicted)
    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)

    # each row of the encoded group must still be a valid 1-hot vector: a single 1, all others 0
    for group in one_hot_groups:
        encoded = transformed[:, group]
        assert ((np.sum(encoded, axis=1) == 1).all())
        assert ((np.max(encoded, axis=1) == 1).all())
        assert ((np.min(encoded, axis=1) == 0).all())
|
||||
|
||||
|
||||
def test_minimizer_ndarray_one_hot_gen():
    """Same data as the basic 1-hot ndarray test, but with target accuracy 0.2 so columns 1 and 2 generalize."""
    samples = np.array([[23, 0, 1, 165],
                        [45, 0, 1, 158],
                        [56, 1, 0, 123],
                        [67, 0, 1, 154],
                        [45, 1, 0, 149],
                        [42, 1, 0, 166],
                        [73, 0, 1, 172],
                        [94, 0, 1, 168],
                        [69, 0, 1, 175],
                        [24, 1, 0, 181],
                        [18, 1, 0, 190]])
    labels = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])

    classifier = DecisionTreeClassifier()
    classifier.fit(samples, labels)
    predicted = classifier.predict(samples)

    feature_names = ['0', '1', '2', '3']
    quasi_identifiers = [0, 1, 2]
    one_hot_groups = [[1, 2]]
    target_accuracy = 0.2
    minimizer = GeneralizeToRepresentative(classifier, target_accuracy=target_accuracy,
                                           feature_slices=one_hot_groups,
                                           features_to_minimize=quasi_identifiers)
    minimizer.fit(dataset=ArrayDataset(samples, predicted))
    transformed = minimizer.transform(dataset=ArrayDataset(samples))
    actual_generalizations = minimizer.generalizations
    expected_generalizations = {'categories': {'1': [[0, 1]], '2': [[0, 1]]},
                                'category_representatives': {'1': [0], '2': [1]},
                                'range_representatives': {'0': []}, 'ranges': {'0': []}, 'untouched': ['3']}

    compare_generalizations(actual_generalizations, expected_generalizations)

    check_features(feature_names, expected_generalizations, transformed, samples)
    check_ncp(minimizer.ncp.transform_score, expected_generalizations)

    rel_accuracy = classifier.score(transformed, predicted)
    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)

    # each row of the encoded group must still be a valid 1-hot vector: a single 1, all others 0
    for group in one_hot_groups:
        encoded = transformed[:, group]
        assert ((np.sum(encoded, axis=1) == 1).all())
        assert ((np.max(encoded, axis=1) == 1).all())
        assert ((np.min(encoded, axis=1) == 0).all())
|
||||
|
||||
|
||||
def test_minimizer_ndarray_one_hot_multi():
    """Minimize ndarray data with two 1-hot groups (columns 1-2 and 3-5), target accuracy 0.2."""
    samples = np.array([[23, 0, 1, 0, 0, 1, 165],
                        [45, 0, 1, 0, 0, 1, 158],
                        [56, 1, 0, 0, 0, 1, 123],
                        [67, 0, 1, 1, 0, 0, 154],
                        [45, 1, 0, 1, 0, 0, 149],
                        [42, 1, 0, 1, 0, 0, 166],
                        [73, 0, 1, 0, 0, 1, 172],
                        [94, 0, 1, 0, 1, 0, 168],
                        [69, 0, 1, 0, 1, 0, 175],
                        [24, 1, 0, 0, 1, 0, 181],
                        [18, 1, 0, 0, 0, 1, 190]])
    labels = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])

    classifier = DecisionTreeClassifier()
    classifier.fit(samples, labels)
    predicted = classifier.predict(samples)

    feature_names = ['0', '1', '2', '3', '4', '5', '6']
    quasi_identifiers = [0, 1, 2, 3, 4, 5]
    one_hot_groups = [[1, 2], [3, 4, 5]]
    target_accuracy = 0.2
    minimizer = GeneralizeToRepresentative(classifier, target_accuracy=target_accuracy,
                                           feature_slices=one_hot_groups,
                                           features_to_minimize=quasi_identifiers)
    minimizer.fit(dataset=ArrayDataset(samples, predicted))
    transformed = minimizer.transform(dataset=ArrayDataset(samples))
    actual_generalizations = minimizer.generalizations
    expected_generalizations = {'categories':
                                {'1': [[0, 1]], '2': [[0, 1]], '3': [[0, 1]], '4': [[0, 1]], '5': [[0, 1]]},
                                'category_representatives': {'1': [0], '2': [1], '3': [0], '4': [1], '5': [0]},
                                'range_representatives': {'0': []}, 'ranges': {'0': []}, 'untouched': ['6']}

    compare_generalizations(actual_generalizations, expected_generalizations)

    check_features(feature_names, expected_generalizations, transformed, samples)
    check_ncp(minimizer.ncp.transform_score, expected_generalizations)

    rel_accuracy = classifier.score(transformed, predicted)
    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)

    # both encoded groups, checked in declaration order, must still hold valid 1-hot vectors
    for group in one_hot_groups:
        encoded = transformed[:, group]
        assert ((np.sum(encoded, axis=1) == 1).all())
        assert ((np.max(encoded, axis=1) == 1).all())
        assert ((np.min(encoded, axis=1) == 0).all())
|
||||
|
||||
|
||||
def test_minimizer_ndarray_one_hot_multi2():
    """Minimize ndarray data whose three columns form one 1-hot group, target accuracy 0.2."""
    samples = np.array([[0, 0, 1],
                        [0, 0, 1],
                        [0, 1, 0],
                        [0, 1, 0],
                        [1, 0, 0],
                        [1, 0, 0]])
    labels = np.array([1, 1, 2, 2, 0, 0])

    classifier = DecisionTreeClassifier()
    classifier.fit(samples, labels)
    predicted = classifier.predict(samples)

    feature_names = ['0', '1', '2']
    quasi_identifiers = [0, 1, 2]
    one_hot_groups = [[0, 1, 2]]
    target_accuracy = 0.2
    minimizer = GeneralizeToRepresentative(classifier, target_accuracy=target_accuracy,
                                           feature_slices=one_hot_groups,
                                           features_to_minimize=quasi_identifiers)
    minimizer.fit(dataset=ArrayDataset(samples, predicted))
    transformed = minimizer.transform(dataset=ArrayDataset(samples))
    actual_generalizations = minimizer.generalizations
    expected_generalizations = {'categories': {'0': [[0, 1]], '1': [[0, 1]], '2': [[0, 1]]},
                                'category_representatives': {'0': [0], '1': [0], '2': [1]},
                                'range_representatives': {},
                                'ranges': {}, 'untouched': []}

    compare_generalizations(actual_generalizations, expected_generalizations)

    check_features(feature_names, expected_generalizations, transformed, samples)
    check_ncp(minimizer.ncp.transform_score, expected_generalizations)

    rel_accuracy = classifier.score(transformed, predicted)
    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)

    # each row of the encoded group must still be a valid 1-hot vector: a single 1, all others 0
    for group in one_hot_groups:
        encoded = transformed[:, group]
        assert ((np.sum(encoded, axis=1) == 1).all())
        assert ((np.max(encoded, axis=1) == 1).all())
        assert ((np.min(encoded, axis=1) == 0).all())
|
||||
|
||||
|
||||
def test_anonymize_pandas_one_hot():
    """Minimize a pandas DataFrame with a named 1-hot group (gender_M / gender_F), target accuracy 0.7."""
    feature_names = ["age", "gender_M", "gender_F", "height"]
    raw_samples = np.array([[23, 0, 1, 165],
                            [45, 0, 1, 158],
                            [56, 1, 0, 123],
                            [67, 0, 1, 154],
                            [45, 1, 0, 149],
                            [42, 1, 0, 166],
                            [73, 0, 1, 172],
                            [94, 0, 1, 168],
                            [69, 0, 1, 175],
                            [24, 1, 0, 181],
                            [18, 1, 0, 190]])
    raw_labels = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    samples = pd.DataFrame(raw_samples, columns=feature_names)
    labels = pd.Series(raw_labels)

    classifier = DecisionTreeClassifier()
    classifier.fit(samples, labels)
    predicted = classifier.predict(samples)

    quasi_identifiers = ["age", "gender_M", "gender_F"]
    one_hot_groups = [["gender_M", "gender_F"]]
    target_accuracy = 0.7
    minimizer = GeneralizeToRepresentative(classifier, target_accuracy=target_accuracy,
                                           feature_slices=one_hot_groups,
                                           features_to_minimize=quasi_identifiers)
    minimizer.fit(dataset=ArrayDataset(samples, predicted))
    transformed = minimizer.transform(dataset=ArrayDataset(samples))
    actual_generalizations = minimizer.generalizations
    expected_generalizations = {'categories': {}, 'category_representatives': {},
                                'range_representatives': {'age': [34.5]},
                                'ranges': {'age': [34.5]}, 'untouched': ['height', 'gender_M', 'gender_F']}

    compare_generalizations(actual_generalizations, expected_generalizations)

    check_features(feature_names, expected_generalizations, transformed, samples, True)
    check_ncp(minimizer.ncp.transform_score, expected_generalizations)

    rel_accuracy = classifier.score(transformed, predicted)
    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)

    # each row of the encoded group must still be a valid 1-hot vector: a single 1, all others 0
    for group in one_hot_groups:
        encoded = transformed.loc[:, group]
        assert ((np.sum(encoded, axis=1) == 1).all())
        assert ((np.max(encoded, axis=1) == 1).all())
        assert ((np.min(encoded, axis=1) == 0).all())
|
||||
|
||||
|
||||
def test_keras_model():
|
||||
(x, y), (x_test, y_test) = get_iris_dataset_np()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue