mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-06-08 15:05:13 +02:00
Compute generalizations with test data when possible (for computing better representatives).
Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
parent
b48b829a01
commit
c2e0fced03
2 changed files with 50 additions and 24 deletions
|
|
@ -325,7 +325,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
|
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
|
||||||
|
|
||||||
# self._cells currently holds the generalization created from the tree leaves
|
# self._cells currently holds the generalization created from the tree leaves
|
||||||
self._calculate_generalizations()
|
self._calculate_generalizations(X_test)
|
||||||
if generalize_using_transform:
|
if generalize_using_transform:
|
||||||
generalized = self._generalize_from_tree(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
|
generalized = self._generalize_from_tree(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
|
||||||
else:
|
else:
|
||||||
|
|
@ -355,7 +355,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
|
|
||||||
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
|
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
|
||||||
|
|
||||||
self._calculate_generalizations()
|
self._calculate_generalizations(X_test)
|
||||||
if generalize_using_transform:
|
if generalize_using_transform:
|
||||||
generalized = self._generalize_from_tree(X_test, x_prepared_test, nodes, self.cells,
|
generalized = self._generalize_from_tree(X_test, x_prepared_test, nodes, self.cells,
|
||||||
self._cells_by_id)
|
self._cells_by_id)
|
||||||
|
|
@ -385,7 +385,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
if removed_feature is None:
|
if removed_feature is None:
|
||||||
break
|
break
|
||||||
|
|
||||||
self._calculate_generalizations()
|
self._calculate_generalizations(X_test)
|
||||||
if generalize_using_transform:
|
if generalize_using_transform:
|
||||||
generalized = self._generalize_from_tree(X_test, x_prepared_test, nodes, self.cells,
|
generalized = self._generalize_from_tree(X_test, x_prepared_test, nodes, self.cells,
|
||||||
self._cells_by_id)
|
self._cells_by_id)
|
||||||
|
|
@ -1084,6 +1084,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
self._generalizations['ranges'],
|
self._generalizations['ranges'],
|
||||||
self._generalizations['categories'])
|
self._generalizations['categories'])
|
||||||
# categorical - use most common value
|
# categorical - use most common value
|
||||||
|
old_category_representatives = category_representatives
|
||||||
category_representatives = {}
|
category_representatives = {}
|
||||||
for feature in self._generalizations['categories']:
|
for feature in self._generalizations['categories']:
|
||||||
category_representatives[feature] = []
|
category_representatives[feature] = []
|
||||||
|
|
@ -1092,34 +1093,42 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
# for c_index in range(len(group)):
|
# for c_index in range(len(group)):
|
||||||
# indexes = [i for i, s in enumerate(sample_indexes) if s[feature][g_index] == c_index]
|
# indexes = [i for i, s in enumerate(sample_indexes) if s[feature][g_index] == c_index]
|
||||||
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == g_index]
|
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == g_index]
|
||||||
rows = samples[indexes]
|
if indexes:
|
||||||
values = rows[:, feature]
|
rows = samples.iloc[indexes]
|
||||||
category = Counter(values).most_common(1)[0][0]
|
values = rows[feature]
|
||||||
category_representatives[feature].append(group[category])
|
category = Counter(values).most_common(1)[0][0]
|
||||||
# c_count = len([s for s in sample_indexes if s[feature][g_index] == c_index])
|
category_representatives[feature].append(category)
|
||||||
# if c_count > max_count:
|
# c_count = len([s for s in sample_indexes if s[feature][g_index] == c_index])
|
||||||
# max_count = c_count
|
# if c_count > max_count:
|
||||||
# category = c_index
|
# max_count = c_count
|
||||||
# category_representatives[feature].append(group[category])
|
# category = c_index
|
||||||
|
# category_representatives[feature].append(group[category])
|
||||||
|
else:
|
||||||
|
category_representatives[feature].append(old_category_representatives[feature][g_index])
|
||||||
|
|
||||||
# numerical - use actual value closest to mean
|
# numerical - use actual value closest to mean
|
||||||
|
old_range_representatives = range_representatives
|
||||||
range_representatives = {}
|
range_representatives = {}
|
||||||
for feature in self._generalizations['ranges']:
|
for feature in self._generalizations['ranges']:
|
||||||
|
range_representatives[feature] = []
|
||||||
# find the mean value (per feature)
|
# find the mean value (per feature)
|
||||||
for index in range(len(self._generalizations['ranges'][feature])):
|
for index in range(len(self._generalizations['ranges'][feature])):
|
||||||
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == index]
|
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == index]
|
||||||
rows = samples[indexes]
|
if indexes:
|
||||||
values = rows[:, feature]
|
rows = samples.iloc[indexes]
|
||||||
median = np.median(values)
|
values = rows[feature]
|
||||||
min_value = max(values)
|
median = np.median(values)
|
||||||
min_dist = float("inf")
|
min_value = max(values)
|
||||||
for value in values:
|
min_dist = float("inf")
|
||||||
# euclidean distance between two floating point values
|
for value in values:
|
||||||
dist = abs(value - median)
|
# euclidean distance between two floating point values
|
||||||
if dist < min_dist:
|
dist = abs(value - median)
|
||||||
min_dist = dist
|
if dist < min_dist:
|
||||||
min_value = value
|
min_dist = dist
|
||||||
range_representatives[feature].append(min_value)
|
min_value = value
|
||||||
|
range_representatives[feature].append(min_value)
|
||||||
|
else:
|
||||||
|
range_representatives[feature].append(old_range_representatives[feature][index])
|
||||||
self._generalizations['category_representatives'] = category_representatives
|
self._generalizations['category_representatives'] = category_representatives
|
||||||
self._generalizations['range_representatives'] = range_representatives
|
self._generalizations['range_representatives'] = range_representatives
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -49,7 +49,24 @@ def test_minimizer_params():
|
||||||
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
|
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
|
||||||
model.fit(ArrayDataset(X, y))
|
model.fit(ArrayDataset(X, y))
|
||||||
|
|
||||||
|
expected_generalizations = {'categories': {}, 'category_representatives': {},
|
||||||
|
'range_representatives': {'age': [38, 0.5, 40], 'height': [170, 0.5, 172]},
|
||||||
|
'ranges': {'age': [38, 39], 'height': [170, 171]}, 'untouched': []}
|
||||||
|
|
||||||
gen = GeneralizeToRepresentative(model, cells=cells)
|
gen = GeneralizeToRepresentative(model, cells=cells)
|
||||||
|
gener = gen.generalizations
|
||||||
|
for key in expected_generalizations['ranges']:
|
||||||
|
assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
|
||||||
|
for key in expected_generalizations['categories']:
|
||||||
|
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]])
|
||||||
|
== set([frozenset(sl) for sl in gener['categories'][key]]))
|
||||||
|
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
|
||||||
|
for key in expected_generalizations['range_representatives']:
|
||||||
|
assert (set(expected_generalizations['range_representatives'][key]) == set(gener['range_representatives'][key]))
|
||||||
|
for key in expected_generalizations['category_representatives']:
|
||||||
|
assert (set([frozenset(sl) for sl in expected_generalizations['category_representatives'][key]])
|
||||||
|
== set([frozenset(sl) for sl in gener['category_representatives'][key]]))
|
||||||
|
|
||||||
gen.fit()
|
gen.fit()
|
||||||
gen.transform(dataset=ArrayDataset(X, features_names=features))
|
gen.transform(dataset=ArrayDataset(X, features_names=features))
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue