From f646109e849ee9f54dd1502d1ac273efd28b69dd Mon Sep 17 00:00:00 2001 From: abigailt Date: Sun, 24 Dec 2023 13:13:52 -0500 Subject: [PATCH] Replace values in multi-column 1-hot encoded features instead of appending so that options are narrowed down Signed-off-by: abigailt --- apt/minimization/minimizer.py | 18 ++++++-------- tests/test_minimizer.py | 46 +++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py index 39521d4..8f2d1b8 100644 --- a/apt/minimization/minimizer.py +++ b/apt/minimization/minimizer.py @@ -602,7 +602,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM elif feature in cell['untouched']: continue else: - raise TypeError("feature " + feature + "not found in cell" + cell['id']) + raise TypeError("feature " + str(feature) + " not found in cell " + str(cell['id'])) # Mark as mapped mapped.itemset(index, 1) return True @@ -736,24 +736,20 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM feature_value = 0 elif range['end'] is None and range['start'] > 0: feature_value = 1 - elif range['start'] is not None and range['end'] is not None: - print(range) - new_cell['categories'][feature].append(feature_value) + new_cell['categories'][feature] = [feature_value] # need to add other columns that represent same 1-hot encoded feature # search for feature group: other_features, encoded = self._get_other_features_in_encoding(feature, self.feature_slices) for other_feature in other_features: - if other_feature not in new_cell['categories'].keys(): - new_cell['categories'][other_feature] = [] if feature_value == 1: - new_cell['categories'][other_feature].append(0) + new_cell['categories'][other_feature] = [0] elif len(encoded) == 2: - new_cell['categories'][other_feature].append(1) - else: - new_cell['categories'][other_feature].append(0) - new_cell['categories'][other_feature].append(1) + new_cell['categories'][other_feature] = [1] + elif (other_feature not in new_cell['categories'].keys() or + len(new_cell['categories'][other_feature]) == 0): + new_cell['categories'][other_feature] = [0, 1] else: if feature in cell['ranges'].keys(): new_cell['ranges'][feature] = cell['ranges'][feature] diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py index ca69cf4..99d1a19 100644 --- a/tests/test_minimizer.py +++ b/tests/test_minimizer.py @@ -1060,6 +1060,52 @@ def test_minimizer_ndarray_one_hot_multi(): assert ((np.min(transformed_slice, axis=1) == 0).all()) +def test_minimizer_ndarray_one_hot_multi2(): + x_train = np.array([[0, 0, 1], + [0, 0, 1], + [0, 1, 0], + [0, 1, 0], + [1, 0, 0], + [1, 0, 0]]) + y_train = np.array([1, 1, 2, 2, 0, 0]) + + model = DecisionTreeClassifier() + model.fit(x_train, y_train) + predictions = model.predict(x_train) + + features = ['0', '1', '2'] + QI = [0, 1, 2] + QI_slices = [[0, 1, 2]] + target_accuracy = 0.2 + gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, feature_slices=QI_slices, + features_to_minimize=QI) + gen.fit(dataset=ArrayDataset(x_train, predictions)) + transformed = gen.transform(dataset=ArrayDataset(x_train)) + gener = gen.generalizations + expected_generalizations = {'categories': + {'1': [[0, 1]], '2': [[0, 1]], '3': [[0, 1]], '4': [[0, 1]], '5': [[0, 1]]}, + 'category_representatives': {'1': [0], '2': [1], '3': [0], '4': [1], '5': [0]}, + 'range_representatives': {'0': []}, 'ranges': {'0': []}, 'untouched': ['6']} + + compare_generalizations(gener, expected_generalizations) + + check_features(features, expected_generalizations, transformed, x_train) + ncp = gen.ncp.transform_score + check_ncp(ncp, expected_generalizations) + + rel_accuracy = model.score(transformed, predictions) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) + transformed_slice = transformed[:, QI_slices[0]] + assert ((np.sum(transformed_slice, axis=1) == 1).all()) + assert ((np.max(transformed_slice, axis=1) == 1).all()) + assert ((np.min(transformed_slice, axis=1) == 0).all()) + transformed_slice = transformed[:, QI_slices[1]] + assert ((np.sum(transformed_slice, axis=1) == 1).all()) + assert ((np.max(transformed_slice, axis=1) == 1).all()) + assert ((np.min(transformed_slice, axis=1) == 0).all()) + + + def test_anonymize_pandas_one_hot(): features = ["age", "gender_M", "gender_F", "height"] x_train = np.array([[23, 0, 1, 165],