mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-06-08 15:05:13 +02:00
Replace values in multi-column 1-hot encoded features instead of appending so that options are narrowed down
Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
parent
0e01e19e0c
commit
f646109e84
2 changed files with 53 additions and 11 deletions
|
|
@ -602,7 +602,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
elif feature in cell['untouched']:
|
||||
continue
|
||||
else:
|
||||
raise TypeError("feature " + feature + "not found in cell" + cell['id'])
|
||||
raise TypeError("feature " + str(feature) + " not found in cell " + str(cell['id']))
|
||||
# Mark as mapped
|
||||
mapped.itemset(index, 1)
|
||||
return True
|
||||
|
|
@ -736,24 +736,20 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
feature_value = 0
|
||||
elif range['end'] is None and range['start'] > 0:
|
||||
feature_value = 1
|
||||
elif range['start'] is not None and range['end'] is not None:
|
||||
print(range)
|
||||
new_cell['categories'][feature].append(feature_value)
|
||||
new_cell['categories'][feature] = [feature_value]
|
||||
|
||||
# need to add other columns that represent same 1-hot encoded feature
|
||||
|
||||
# search for feature group:
|
||||
other_features, encoded = self._get_other_features_in_encoding(feature, self.feature_slices)
|
||||
for other_feature in other_features:
|
||||
if other_feature not in new_cell['categories'].keys():
|
||||
new_cell['categories'][other_feature] = []
|
||||
if feature_value == 1:
|
||||
new_cell['categories'][other_feature].append(0)
|
||||
new_cell['categories'][other_feature] = [0]
|
||||
elif len(encoded) == 2:
|
||||
new_cell['categories'][other_feature].append(1)
|
||||
else:
|
||||
new_cell['categories'][other_feature].append(0)
|
||||
new_cell['categories'][other_feature].append(1)
|
||||
new_cell['categories'][other_feature] = [1]
|
||||
elif (other_feature not in new_cell['categories'].keys() or
|
||||
len(new_cell['categories'][other_feature]) == 0):
|
||||
new_cell['categories'][other_feature] = [0, 1]
|
||||
else:
|
||||
if feature in cell['ranges'].keys():
|
||||
new_cell['ranges'][feature] = cell['ranges'][feature]
|
||||
|
|
|
|||
|
|
@ -1060,6 +1060,52 @@ def test_minimizer_ndarray_one_hot_multi():
|
|||
assert ((np.min(transformed_slice, axis=1) == 0).all())
|
||||
|
||||
|
||||
def test_minimizer_ndarray_one_hot_multi2():
    """Minimize an ndarray whose only content is one 3-column 1-hot encoded feature.

    A decision tree is fitted on six rows (two per category), then
    ``GeneralizeToRepresentative`` is fitted with all three columns declared as
    a single ``feature_slices`` group. The test checks the resulting
    generalizations, the NCP score, the relative accuracy of the model on the
    transformed data, and that every transformed row is still a valid 1-hot
    vector within each declared slice (exactly one 1, all other entries 0).
    """
    x_train = np.array([[0, 0, 1],
                        [0, 0, 1],
                        [0, 1, 0],
                        [0, 1, 0],
                        [1, 0, 0],
                        [1, 0, 0]])
    y_train = np.array([1, 1, 2, 2, 0, 0])

    model = DecisionTreeClassifier()
    model.fit(x_train, y_train)
    predictions = model.predict(x_train)

    features = ['0', '1', '2']
    QI = [0, 1, 2]
    # All three columns belong to the same 1-hot encoded feature.
    QI_slices = [[0, 1, 2]]
    target_accuracy = 0.2
    gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, feature_slices=QI_slices,
                                     features_to_minimize=QI)
    gen.fit(dataset=ArrayDataset(x_train, predictions))
    transformed = gen.transform(dataset=ArrayDataset(x_train))
    gener = gen.generalizations
    # NOTE(review): the keys below ('1'..'5', '0', '6') do not match this test's
    # features ('0', '1', '2'); they look carried over from a 7-feature test.
    # Confirm the intended expected generalizations for this 3-column dataset.
    expected_generalizations = {'categories':
                                {'1': [[0, 1]], '2': [[0, 1]], '3': [[0, 1]], '4': [[0, 1]], '5': [[0, 1]]},
                                'category_representatives': {'1': [0], '2': [1], '3': [0], '4': [1], '5': [0]},
                                'range_representatives': {'0': []}, 'ranges': {'0': []}, 'untouched': ['6']}

    compare_generalizations(gener, expected_generalizations)

    check_features(features, expected_generalizations, transformed, x_train)
    ncp = gen.ncp.transform_score
    check_ncp(ncp, expected_generalizations)

    rel_accuracy = model.score(transformed, predictions)
    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)

    # Validate every declared slice instead of hard-coding slice indices:
    # QI_slices holds a single group here, so indexing QI_slices[1] would
    # raise IndexError.
    for slice_columns in QI_slices:
        transformed_slice = transformed[:, slice_columns]
        assert ((np.sum(transformed_slice, axis=1) == 1).all())
        assert ((np.max(transformed_slice, axis=1) == 1).all())
        assert ((np.min(transformed_slice, axis=1) == 0).all())
|
||||
|
||||
|
||||
|
||||
def test_anonymize_pandas_one_hot():
|
||||
features = ["age", "gender_M", "gender_F", "height"]
|
||||
x_train = np.array([[23, 0, 1, 165],
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue