mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-06-08 15:05:13 +02:00
More tests and fixes. Make sure representative values in generalizations for 1-hot encoded features are consistent.
Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
parent
904462a6a8
commit
c122fc7387
2 changed files with 126 additions and 32 deletions
|
|
@ -96,8 +96,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_PROBABILITIES)
|
||||
self.target_accuracy = target_accuracy
|
||||
self.cells = cells
|
||||
if cells:
|
||||
self._calculate_generalizations()
|
||||
self.categorical_features = []
|
||||
if categorical_features:
|
||||
self.categorical_features = categorical_features
|
||||
|
|
@ -117,6 +115,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
self._dt = None
|
||||
self._features = None
|
||||
self._level = 0
|
||||
if cells:
|
||||
self._calculate_generalizations()
|
||||
|
||||
def get_params(self, deep=True):
|
||||
"""
|
||||
|
|
@ -741,19 +741,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
# need to add other columns that represent same 1-hot encoded feature
|
||||
|
||||
# search for feature group:
|
||||
for encoded in self.feature_slices:
|
||||
if feature in encoded:
|
||||
other_features = list(set(encoded) - set([feature]))
|
||||
for other_feature in other_features:
|
||||
if other_feature not in new_cell['categories'].keys():
|
||||
new_cell['categories'][other_feature] = []
|
||||
if feature_value == 1:
|
||||
new_cell['categories'][other_feature].append(0)
|
||||
elif len(encoded) == 2:
|
||||
new_cell['categories'][other_feature].append(1)
|
||||
else:
|
||||
new_cell['categories'][other_feature].append(0)
|
||||
new_cell['categories'][other_feature].append(1)
|
||||
other_features, encoded = self._get_other_features_in_encoding(feature, self.feature_slices)
|
||||
for other_feature in other_features:
|
||||
if other_feature not in new_cell['categories'].keys():
|
||||
new_cell['categories'][other_feature] = []
|
||||
if feature_value == 1:
|
||||
new_cell['categories'][other_feature].append(0)
|
||||
elif len(encoded) == 2:
|
||||
new_cell['categories'][other_feature].append(1)
|
||||
else:
|
||||
new_cell['categories'][other_feature].append(0)
|
||||
new_cell['categories'][other_feature].append(1)
|
||||
else:
|
||||
if feature in cell['ranges'].keys():
|
||||
new_cell['ranges'][feature] = cell['ranges'][feature]
|
||||
|
|
@ -1104,17 +1102,31 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
# categorical - use most common value
|
||||
old_category_representatives = category_representatives
|
||||
category_representatives = {}
|
||||
done = set()
|
||||
for feature in self._generalizations['categories']:
|
||||
category_representatives[feature] = []
|
||||
for g_index, group in enumerate(self._generalizations['categories'][feature]):
|
||||
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == g_index]
|
||||
if indexes:
|
||||
rows = samples.iloc[indexes]
|
||||
values = rows[feature]
|
||||
category = Counter(values).most_common(1)[0][0]
|
||||
category_representatives[feature].append(category)
|
||||
else:
|
||||
category_representatives[feature].append(old_category_representatives[feature][g_index])
|
||||
if feature not in done:
|
||||
category_representatives[feature] = []
|
||||
for g_index, group in enumerate(self._generalizations['categories'][feature]):
|
||||
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == g_index]
|
||||
if indexes:
|
||||
rows = samples.iloc[indexes]
|
||||
if feature in self.all_one_hot_features:
|
||||
other_features, encoded = self._get_other_features_in_encoding(feature,
|
||||
self.feature_slices)
|
||||
values = rows.loc[:, encoded].to_numpy()
|
||||
unique_rows, counts = np.unique(values, axis=0, return_counts=True)
|
||||
rep = unique_rows[np.argmax(counts)]
|
||||
for i, e in enumerate(encoded):
|
||||
done.add(e)
|
||||
if e not in category_representatives.keys():
|
||||
category_representatives[e] = []
|
||||
category_representatives[e].append(rep[i])
|
||||
else:
|
||||
values = rows[feature]
|
||||
category = Counter(values).most_common(1)[0][0]
|
||||
category_representatives[feature].append(category)
|
||||
else:
|
||||
category_representatives[feature].append(old_category_representatives[feature][g_index])
|
||||
|
||||
# numerical - use actual value closest to mean
|
||||
old_range_representatives = range_representatives
|
||||
|
|
@ -1223,35 +1235,55 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
range_representatives[feature].append(prev_value + 1)
|
||||
return ranges, range_representatives
|
||||
|
||||
@staticmethod
|
||||
def _calculate_categories(cells):
|
||||
def _calculate_categories(self, cells):
|
||||
categories = {}
|
||||
category_representatives = {}
|
||||
categorical_features_values = GeneralizeToRepresentative._calculate_categorical_features_values(cells)
|
||||
assigned_features = set()
|
||||
for feature in categorical_features_values.keys():
|
||||
partitions = []
|
||||
category_representatives[feature] = []
|
||||
values = categorical_features_values[feature]
|
||||
assigned = []
|
||||
assigned_values = set()
|
||||
for i in range(len(values)):
|
||||
value1 = values[i]
|
||||
if value1 in assigned:
|
||||
if value1 in assigned_values:
|
||||
continue
|
||||
partition = [value1]
|
||||
assigned.append(value1)
|
||||
assigned_values.add(value1)
|
||||
for j in range(len(values)):
|
||||
if j <= i:
|
||||
continue
|
||||
value2 = values[j]
|
||||
if GeneralizeToRepresentative._are_inseparable(cells, feature, value1, value2):
|
||||
partition.append(value2)
|
||||
assigned.append(value2)
|
||||
assigned_values.add(value2)
|
||||
partitions.append(partition)
|
||||
# default representative values (computed with no data)
|
||||
category_representatives[feature].append(partition[0]) # random
|
||||
# for 1-hot encoded features, the first encountered feature will get the value 1 and the rest 0
|
||||
if len(partition) > 1 and feature in self.all_one_hot_features:
|
||||
other_features, _ = self._get_other_features_in_encoding(feature, self.feature_slices)
|
||||
assigned = False
|
||||
for other_feature in other_features:
|
||||
if other_feature in assigned_features:
|
||||
category_representatives[feature].append(0)
|
||||
assigned = True
|
||||
break
|
||||
if not assigned:
|
||||
category_representatives[feature].append(1)
|
||||
assigned_features.add(feature)
|
||||
else:
|
||||
category_representatives[feature].append(partition[0]) # random
|
||||
categories[feature] = partitions
|
||||
return categories, category_representatives
|
||||
|
||||
@staticmethod
def _get_other_features_in_encoding(feature, feature_slices):
    """
    Locate the 1-hot encoding group that contains ``feature``.

    :param feature: Name (or index) of a single column belonging to a 1-hot encoded feature.
    :param feature_slices: Iterable of groups, each one listing the columns that together
                           encode one feature. ``None`` is treated as "no groups".
    :return: A tuple ``(other_features, encoded)`` where ``encoded`` is the first group that
             contains ``feature`` and ``other_features`` is a list of the remaining columns of
             that group, in the same order as they appear in ``encoded``. Returns ``([], [])``
             when ``feature`` does not belong to any group.
    """
    if feature_slices is None:
        return [], []
    for encoded in feature_slices:
        if feature in encoded:
            # Keep the group's own column order (de-duplicated). A set difference would hand
            # back the siblings in arbitrary, hash-dependent order, which makes anything
            # derived from iterating over them non-reproducible between runs.
            other_features = [member for member in dict.fromkeys(encoded) if member != feature]
            return other_features, encoded
    return [], []
|
||||
|
||||
@staticmethod
|
||||
def _calculate_categorical_features_values(cells):
|
||||
categorical_features_values = {}
|
||||
|
|
|
|||
|
|
@ -959,6 +959,10 @@ def test_minimizer_ndarray_one_hot():
|
|||
|
||||
rel_accuracy = model.score(transformed, predictions)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
transformed_slice = transformed[:, QI_slices[0]]
|
||||
assert ((np.sum(transformed_slice, axis=1) == 1).all())
|
||||
assert ((np.max(transformed_slice, axis=1) == 1).all())
|
||||
assert ((np.min(transformed_slice, axis=1) == 0).all())
|
||||
|
||||
|
||||
def test_minimizer_ndarray_one_hot_gen():
|
||||
|
|
@ -1000,6 +1004,60 @@ def test_minimizer_ndarray_one_hot_gen():
|
|||
|
||||
rel_accuracy = model.score(transformed, predictions)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
transformed_slice = transformed[:, QI_slices[0]]
|
||||
assert ((np.sum(transformed_slice, axis=1) == 1).all())
|
||||
assert ((np.max(transformed_slice, axis=1) == 1).all())
|
||||
assert ((np.min(transformed_slice, axis=1) == 0).all())
|
||||
|
||||
|
||||
def test_minimizer_ndarray_one_hot_multi():
    """Minimize an ndarray dataset that contains two separate 1-hot encoded groups.

    Columns 1-2 and columns 3-5 are each declared as one encoded group. Besides the usual
    generalization, NCP and accuracy checks, every transformed row must still hold exactly
    one 1 (and otherwise 0s) inside each group.
    """
    x_train = np.array([[23, 0, 1, 0, 0, 1, 165],
                        [45, 0, 1, 0, 0, 1, 158],
                        [56, 1, 0, 0, 0, 1, 123],
                        [67, 0, 1, 1, 0, 0, 154],
                        [45, 1, 0, 1, 0, 0, 149],
                        [42, 1, 0, 1, 0, 0, 166],
                        [73, 0, 1, 0, 0, 1, 172],
                        [94, 0, 1, 0, 1, 0, 168],
                        [69, 0, 1, 0, 1, 0, 175],
                        [24, 1, 0, 0, 1, 0, 181],
                        [18, 1, 0, 0, 0, 1, 190]])
    y_train = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])

    model = DecisionTreeClassifier()
    model.fit(x_train, y_train)
    predictions = model.predict(x_train)

    features = ['0', '1', '2', '3', '4', '5', '6']
    QI = [0, 1, 2, 3, 4, 5]
    QI_slices = [[1, 2], [3, 4, 5]]
    target_accuracy = 0.2

    gen = GeneralizeToRepresentative(
        model,
        target_accuracy=target_accuracy,
        feature_slices=QI_slices,
        features_to_minimize=QI,
    )
    gen.fit(dataset=ArrayDataset(x_train, predictions))
    transformed = gen.transform(dataset=ArrayDataset(x_train))
    gener = gen.generalizations

    expected_categories = {'1': [[0, 1]], '2': [[0, 1]], '3': [[0, 1]], '4': [[0, 1]], '5': [[0, 1]]}
    expected_category_representatives = {'1': [0], '2': [1], '3': [0], '4': [1], '5': [0]}
    expected_generalizations = {
        'categories': expected_categories,
        'category_representatives': expected_category_representatives,
        'range_representatives': {'0': []},
        'ranges': {'0': []},
        'untouched': ['6'],
    }

    compare_generalizations(gener, expected_generalizations)

    check_features(features, expected_generalizations, transformed, x_train)
    ncp = gen.ncp.transform_score
    check_ncp(ncp, expected_generalizations)

    rel_accuracy = model.score(transformed, predictions)
    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)

    # each encoded group, in declaration order, must remain a valid 1-hot vector per row
    for encoded_columns in QI_slices:
        transformed_slice = transformed[:, encoded_columns]
        assert ((np.sum(transformed_slice, axis=1) == 1).all())
        assert ((np.max(transformed_slice, axis=1) == 1).all())
        assert ((np.min(transformed_slice, axis=1) == 0).all())
|
||||
|
||||
|
||||
def test_anonymize_pandas_one_hot():
|
||||
|
|
@ -1043,6 +1101,10 @@ def test_anonymize_pandas_one_hot():
|
|||
|
||||
rel_accuracy = model.score(transformed, predictions)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
transformed_slice = transformed.loc[:, QI_slices[0]]
|
||||
assert ((np.sum(transformed_slice, axis=1) == 1).all())
|
||||
assert ((np.max(transformed_slice, axis=1) == 1).all())
|
||||
assert ((np.min(transformed_slice, axis=1) == 0).all())
|
||||
|
||||
|
||||
def test_keras_model():
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue