More tests and fixes. Make sure representative values in generalizations for 1-hot encoded features are consistent.

Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
abigailt 2023-11-19 11:02:21 -05:00
parent 904462a6a8
commit c122fc7387
2 changed files with 126 additions and 32 deletions

View file

@ -96,8 +96,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_PROBABILITIES)
self.target_accuracy = target_accuracy
self.cells = cells
if cells:
self._calculate_generalizations()
self.categorical_features = []
if categorical_features:
self.categorical_features = categorical_features
@ -117,6 +115,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._dt = None
self._features = None
self._level = 0
if cells:
self._calculate_generalizations()
def get_params(self, deep=True):
"""
@ -741,19 +741,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# need to add other columns that represent same 1-hot encoded feature
# search for feature group:
for encoded in self.feature_slices:
if feature in encoded:
other_features = list(set(encoded) - set([feature]))
for other_feature in other_features:
if other_feature not in new_cell['categories'].keys():
new_cell['categories'][other_feature] = []
if feature_value == 1:
new_cell['categories'][other_feature].append(0)
elif len(encoded) == 2:
new_cell['categories'][other_feature].append(1)
else:
new_cell['categories'][other_feature].append(0)
new_cell['categories'][other_feature].append(1)
other_features, encoded = self._get_other_features_in_encoding(feature, self.feature_slices)
for other_feature in other_features:
if other_feature not in new_cell['categories'].keys():
new_cell['categories'][other_feature] = []
if feature_value == 1:
new_cell['categories'][other_feature].append(0)
elif len(encoded) == 2:
new_cell['categories'][other_feature].append(1)
else:
new_cell['categories'][other_feature].append(0)
new_cell['categories'][other_feature].append(1)
else:
if feature in cell['ranges'].keys():
new_cell['ranges'][feature] = cell['ranges'][feature]
@ -1104,17 +1102,31 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# categorical - use most common value
old_category_representatives = category_representatives
category_representatives = {}
done = set()
for feature in self._generalizations['categories']:
category_representatives[feature] = []
for g_index, group in enumerate(self._generalizations['categories'][feature]):
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == g_index]
if indexes:
rows = samples.iloc[indexes]
values = rows[feature]
category = Counter(values).most_common(1)[0][0]
category_representatives[feature].append(category)
else:
category_representatives[feature].append(old_category_representatives[feature][g_index])
if feature not in done:
category_representatives[feature] = []
for g_index, group in enumerate(self._generalizations['categories'][feature]):
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == g_index]
if indexes:
rows = samples.iloc[indexes]
if feature in self.all_one_hot_features:
other_features, encoded = self._get_other_features_in_encoding(feature,
self.feature_slices)
values = rows.loc[:, encoded].to_numpy()
unique_rows, counts = np.unique(values, axis=0, return_counts=True)
rep = unique_rows[np.argmax(counts)]
for i, e in enumerate(encoded):
done.add(e)
if e not in category_representatives.keys():
category_representatives[e] = []
category_representatives[e].append(rep[i])
else:
values = rows[feature]
category = Counter(values).most_common(1)[0][0]
category_representatives[feature].append(category)
else:
category_representatives[feature].append(old_category_representatives[feature][g_index])
# numerical - use actual value closest to mean
old_range_representatives = range_representatives
@ -1223,35 +1235,55 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
range_representatives[feature].append(prev_value + 1)
return ranges, range_representatives
@staticmethod
def _calculate_categories(cells):
def _calculate_categories(self, cells):
categories = {}
category_representatives = {}
categorical_features_values = GeneralizeToRepresentative._calculate_categorical_features_values(cells)
assigned_features = set()
for feature in categorical_features_values.keys():
partitions = []
category_representatives[feature] = []
values = categorical_features_values[feature]
assigned = []
assigned_values = set()
for i in range(len(values)):
value1 = values[i]
if value1 in assigned:
if value1 in assigned_values:
continue
partition = [value1]
assigned.append(value1)
assigned_values.add(value1)
for j in range(len(values)):
if j <= i:
continue
value2 = values[j]
if GeneralizeToRepresentative._are_inseparable(cells, feature, value1, value2):
partition.append(value2)
assigned.append(value2)
assigned_values.add(value2)
partitions.append(partition)
# default representative values (computed with no data)
category_representatives[feature].append(partition[0]) # random
# for 1-hot encoded features, the first encountered feature will get the value 1 and the rest 0
if len(partition) > 1 and feature in self.all_one_hot_features:
other_features, _ = self._get_other_features_in_encoding(feature, self.feature_slices)
assigned = False
for other_feature in other_features:
if other_feature in assigned_features:
category_representatives[feature].append(0)
assigned = True
break
if not assigned:
category_representatives[feature].append(1)
assigned_features.add(feature)
else:
category_representatives[feature].append(partition[0]) # random
categories[feature] = partitions
return categories, category_representatives
@staticmethod
def _get_other_features_in_encoding(feature, feature_slices):
for encoded in feature_slices:
if feature in encoded:
return (list(set(encoded) - set([feature]))), encoded
return [], []
@staticmethod
def _calculate_categorical_features_values(cells):
categorical_features_values = {}

View file

@ -959,6 +959,10 @@ def test_minimizer_ndarray_one_hot():
rel_accuracy = model.score(transformed, predictions)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
transformed_slice = transformed[:, QI_slices[0]]
assert ((np.sum(transformed_slice, axis=1) == 1).all())
assert ((np.max(transformed_slice, axis=1) == 1).all())
assert ((np.min(transformed_slice, axis=1) == 0).all())
def test_minimizer_ndarray_one_hot_gen():
@ -1000,6 +1004,60 @@ def test_minimizer_ndarray_one_hot_gen():
rel_accuracy = model.score(transformed, predictions)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
transformed_slice = transformed[:, QI_slices[0]]
assert ((np.sum(transformed_slice, axis=1) == 1).all())
assert ((np.max(transformed_slice, axis=1) == 1).all())
assert ((np.min(transformed_slice, axis=1) == 0).all())
def test_minimizer_ndarray_one_hot_multi():
    """
    Minimize an ndarray dataset in which two separate column groups are passed as feature slices.

    Columns 1-2 and columns 3-5 are each given to the minimizer as one slice. Besides the usual
    generalization, NCP and accuracy checks, the test verifies that in the transformed data every row of
    each slice still sums to 1 with a maximum of 1 and a minimum of 0.
    """
    target_accuracy = 0.2
    features = ['0', '1', '2', '3', '4', '5', '6']
    quasi_identifiers = [0, 1, 2, 3, 4, 5]
    one_hot_groups = [[1, 2], [3, 4, 5]]

    x_train = np.array([[23, 0, 1, 0, 0, 1, 165],
                        [45, 0, 1, 0, 0, 1, 158],
                        [56, 1, 0, 0, 0, 1, 123],
                        [67, 0, 1, 1, 0, 0, 154],
                        [45, 1, 0, 1, 0, 0, 149],
                        [42, 1, 0, 1, 0, 0, 166],
                        [73, 0, 1, 0, 0, 1, 172],
                        [94, 0, 1, 0, 1, 0, 168],
                        [69, 0, 1, 0, 1, 0, 175],
                        [24, 1, 0, 0, 1, 0, 181],
                        [18, 1, 0, 0, 0, 1, 190]])
    y_train = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])

    model = DecisionTreeClassifier()
    model.fit(x_train, y_train)
    predictions = model.predict(x_train)

    gen = GeneralizeToRepresentative(model,
                                     target_accuracy=target_accuracy,
                                     feature_slices=one_hot_groups,
                                     features_to_minimize=quasi_identifiers)
    gen.fit(dataset=ArrayDataset(x_train, predictions))
    transformed = gen.transform(dataset=ArrayDataset(x_train))

    expected_generalizations = {
        'categories': {'1': [[0, 1]], '2': [[0, 1]], '3': [[0, 1]], '4': [[0, 1]], '5': [[0, 1]]},
        'category_representatives': {'1': [0], '2': [1], '3': [0], '4': [1], '5': [0]},
        'range_representatives': {'0': []},
        'ranges': {'0': []},
        'untouched': ['6'],
    }

    compare_generalizations(gen.generalizations, expected_generalizations)
    check_features(features, expected_generalizations, transformed, x_train)
    ncp = gen.ncp.transform_score
    check_ncp(ncp, expected_generalizations)

    rel_accuracy = model.score(transformed, predictions)
    assert rel_accuracy >= target_accuracy or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF

    # every slice must remain a valid 1-hot encoding after the transformation
    for group in one_hot_groups:
        encoded_columns = transformed[:, group]
        assert (np.sum(encoded_columns, axis=1) == 1).all()
        assert (np.max(encoded_columns, axis=1) == 1).all()
        assert (np.min(encoded_columns, axis=1) == 0).all()
def test_anonymize_pandas_one_hot():
@ -1043,6 +1101,10 @@ def test_anonymize_pandas_one_hot():
rel_accuracy = model.score(transformed, predictions)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
transformed_slice = transformed.loc[:, QI_slices[0]]
assert ((np.sum(transformed_slice, axis=1) == 1).all())
assert ((np.max(transformed_slice, axis=1) == 1).all())
assert ((np.min(transformed_slice, axis=1) == 0).all())
def test_keras_model():