From 2a657388af0655440548e7e8fc09bf698cb82943 Mon Sep 17 00:00:00 2001
From: abigailt <abigailt@il.ibm.com>
Date: Wed, 15 Nov 2023 08:21:40 -0500
Subject: [PATCH] Initial version with first working test

Signed-off-by: abigailt <abigailt@il.ibm.com>
---
 apt/minimization/minimizer.py | 81 +++++++++++++++++++++++++++++++----
 tests/test_minimizer.py       | 80 +++++++++++++++++++++++++++++++++-
 2 files changed, 150 insertions(+), 11 deletions(-)

diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py
index 0e2e3b8..53ef7c5 100644
--- a/apt/minimization/minimizer.py
+++ b/apt/minimization/minimizer.py
@@ -56,9 +56,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
     :param encoder: Optional encoder for encoding data before feeding it into the estimator (e.g., for categorical
                     features). If not provided, the data will be fed as is directly to the estimator.
     :type encoder: sklearn OrdinalEncoder or OneHotEncoder
-    :type categorical_features: list of strings, optional
-    :param features_to_minimize: The features to be minimized.
+    :type categorical_features: list of strings or integers, optional
+    :param features_to_minimize: The features to be minimized. If not provided, all features will be minimized.
     :type features_to_minimize: list of strings or int, optional
+    :param feature_slices: If some of the features to be minimized are 1-hot encoded columns that must remain
+                           consistent after minimization, provide a list of lists, where each inner list holds
+                           the column names or indexes that together represent a single original feature.
+    :type feature_slices: list of lists of strings or integers, optional
     :param train_only_features_to_minimize: Whether to train the tree just on the ``features_to_minimize`` or on all
                                             features. Default is only on ``features_to_minimize``.
     :type train_only_features_to_minimize: boolean, optional
@@ -79,6 +83,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                  categorical_features: Optional[Union[np.ndarray, list]] = None,
                  encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None,
                  features_to_minimize: Optional[Union[np.ndarray, list]] = None,
+                 feature_slices: Optional[list] = None,
                  train_only_features_to_minimize: Optional[bool] = True,
                  is_regression: Optional[bool] = False,
                  generalize_using_transform: bool = True):
@@ -97,6 +102,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
         if categorical_features:
             self.categorical_features = categorical_features
         self.features_to_minimize = features_to_minimize
+        self.feature_slices = feature_slices
+        if self.feature_slices:
+            self.all_one_hot_features = set([str(feature) for encoded in self.feature_slices for feature in encoded])
+        else:
+            self.all_one_hot_features = set()
         self.train_only_features_to_minimize = train_only_features_to_minimize
         self.is_regression = is_regression
         self.encoder = encoder
@@ -121,6 +131,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
         ret['target_accuracy'] = self.target_accuracy
         ret['categorical_features'] = self.categorical_features
         ret['features_to_minimize'] = self.features_to_minimize
+        ret['feature_slices'] = self.feature_slices
         ret['train_only_features_to_minimize'] = self.train_only_features_to_minimize
         ret['is_regression'] = self.is_regression
         ret['estimator'] = self.estimator
@@ -151,6 +162,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
             self.categorical_features = params['categorical_features']
         if 'features_to_minimize' in params:
             self.features_to_minimize = params['features_to_minimize']
+        if 'feature_slices' in params:
+            self.feature_slices = params['feature_slices']
         if 'train_only_features_to_minimize' in params:
             self.train_only_features_to_minimize = params['train_only_features_to_minimize']
         if 'is_regression' in params:
@@ -259,6 +272,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
             self.features_to_minimize = [str(i) for i in self.features_to_minimize]
             if not all(elem in self._features for elem in self.features_to_minimize):
                 raise ValueError('features to minimize should be a subset of features names')
+            if self.feature_slices:
+                temp_list = []
+                for slice in self.feature_slices:
+                    new_slice = [str(i) for i in slice]
+                    if not all(elem in self._features for elem in new_slice):
+                        raise ValueError('features in slices should be a subset of features names')
+                    temp_list.append(new_slice)
+                self.feature_slices = temp_list
             x_qi = x.loc[:, self.features_to_minimize]
 
             # divide dataset into train and test
@@ -703,6 +724,36 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                             # categorical feature can not have this value
                             if categorical_value in new_cell['categories'][categorical_feature]:
                                 new_cell['categories'][categorical_feature].remove(categorical_value)
+                # features that were already one-hot encoded. Legal values should be 0 or 1
+                elif feature in self.all_one_hot_features:
+                    if feature not in new_cell['categories'].keys():
+                        new_cell['categories'][feature] = []
+                    if feature in cell['ranges']:
+                        range = cell['ranges'][feature]
+                        if range['start'] is None and range['end'] < 1:
+                            feature_value = 0
+                        elif range['end'] is None and range['start'] > 0:
+                            feature_value = 1
+                        elif range['start'] is not None and range['end'] is not None:
+                            print(range)
+                        new_cell['categories'][feature].append(feature_value)
+
+                        # need to add other columns that represent same 1-hot encoded feature
+
+                        # search for feature group:
+                        for encoded in self.feature_slices:
+                            if feature in encoded:
+                                other_features = list(set(encoded) - set([feature]))
+                                for other_feature in other_features:
+                                    if other_feature not in new_cell['categories'].keys():
+                                        new_cell['categories'][other_feature] = []
+                                    if feature_value == 1:
+                                        new_cell['categories'][other_feature].append(0)
+                                    elif len(encoded) == 2:
+                                        new_cell['categories'][other_feature].append(1)
+                                    else:
+                                        new_cell['categories'][other_feature].append(0)
+                                        new_cell['categories'][other_feature].append(1)
                 else:
                     if feature in cell['ranges'].keys():
                         new_cell['ranges'][feature] = cell['ranges'][feature]
@@ -813,6 +864,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                     min_dist = dist
                     min = i
                 i = i + 1
+            # since this is an actual row from the data, correct one-hot encoding is already guaranteed
             row = match_rows.iloc[min]
             for feature in cell['ranges'].keys():
                 cell['representative'][feature] = row[feature]
@@ -861,6 +913,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
         new_dtypes = {}
         for t in dtypes.keys():
             new_dtypes[t] = pd.Series(dtype=dtypes[t].name)
+            dtypes[t] = dtypes[t].name
         representatives = pd.DataFrame(new_dtypes)  # empty except for columns
         original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)
 
@@ -891,6 +944,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                 replace = pd.DataFrame(replace, indexes, columns=self._features)
                 original_data_generalized.loc[indexes, representatives.columns.tolist()] = replace
 
+        original_data_generalized = original_data_generalized.astype(dtype=dtypes)
         return original_data_generalized
 
     def _generalize(self, data, data_prepared, nodes):
@@ -1024,7 +1078,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                                         current_accuracy):
         new_cells = copy.deepcopy(self.cells)
         cells_by_id = copy.deepcopy(self._cells_by_id)
-        GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
+        self._remove_feature_from_cells(new_cells, cells_by_id, feature)
         generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
                                                  cells_by_id)
         accuracy = self._calculate_accuracy(generalized, labels, self.estimator, self.encoder)
@@ -1229,16 +1283,25 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
         untouched = untouched.intersection(*untouched_lists)
         return list(untouched)
 
+    def _remove_feature_from_cells(self, cells, cells_by_id, feature):
+        """Remove ``feature`` from all cells; a 1-hot encoded column takes its whole slice with it."""
+        if feature not in self.all_one_hot_features:
+            self._remove_feature_from_cells_internal(cells, cells_by_id, [feature])
+            return
+        for group in (s for s in self.feature_slices if feature in s):
+            self._remove_feature_from_cells_internal(cells, cells_by_id, group)
+
     @staticmethod
-    def _remove_feature_from_cells(cells, cells_by_id, feature):
+    def _remove_feature_from_cells_internal(cells, cells_by_id, features):
         for cell in cells:
             if 'untouched' not in cell:
                 cell['untouched'] = []
-            if feature in cell['ranges'].keys():
-                del cell['ranges'][feature]
-            elif feature in cell['categories'].keys():
-                del cell['categories'][feature]
-            cell['untouched'].append(feature)
+            for name in features:  # drop each listed feature's generalization from this cell
+                if name in cell['ranges']:
+                    cell['ranges'].pop(name)
+                elif name in cell['categories']:
+                    cell['categories'].pop(name)
+                cell['untouched'].append(name)
             cells_by_id[cell['id']] = cell.copy()
 
     @staticmethod
diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py
index ca34fbd..2cc141e 100644
--- a/tests/test_minimizer.py
+++ b/tests/test_minimizer.py
@@ -200,9 +200,9 @@ def check_features(features, expected_generalizations, transformed, x, pandas=Fa
             if features[i] in modified_features:
                 indexes.append(i)
         if len(indexes) != transformed.shape[1]:
-            assert ((np.delete(transformed, indexes, axis=1) == np.delete(x, indexes, axis=1)).all())
+            assert (np.array_equal(np.delete(transformed, indexes, axis=1), np.delete(x, indexes, axis=1)))
         if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
-            assert (((transformed[indexes]) != (x[indexes])).any())
+            assert (not np.array_equal(transformed[:, indexes], x[:, indexes]))
 
 
 def check_ncp(ncp, expected_generalizations):
@@ -920,6 +920,82 @@ def test_BaseEstimator_regression(diabetes_dataset):
     assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
 
 
+def test_minimizer_ndarray_one_hot():
+    # columns 1 and 2 together one-hot encode a single feature; column 3 is not minimized
+    samples = np.array([[23, 0, 1, 165],
+                        [45, 0, 1, 158],
+                        [56, 1, 0, 123],
+                        [67, 0, 1, 154],
+                        [45, 1, 0, 149],
+                        [42, 1, 0, 166],
+                        [73, 0, 1, 172],
+                        [94, 0, 1, 168],
+                        [69, 0, 1, 175],
+                        [24, 1, 0, 181],
+                        [18, 1, 0, 190]])
+    labels = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+
+    classifier = DecisionTreeClassifier()
+    classifier.fit(samples, labels)
+    predicted = classifier.predict(samples)
+
+    column_names = ['0', '1', '2', '3']
+    minimized_columns = [0, 1, 2]
+    one_hot_groups = [[1, 2]]
+    accuracy_goal = 0.7
+    minimizer = GeneralizeToRepresentative(classifier, target_accuracy=accuracy_goal,
+                                           features_to_minimize=minimized_columns, feature_slices=one_hot_groups)
+    minimizer.fit(dataset=ArrayDataset(samples, predicted))
+    minimized = minimizer.transform(dataset=ArrayDataset(samples))
+    learned = minimizer.generalizations
+    expected = {'categories': {}, 'category_representatives': {},
+                'range_representatives': {'0': [34.5]},
+                'ranges': {'0': [34.5]}, 'untouched': ['3', '1', '2']}
+
+    compare_generalizations(learned, expected)
+    check_features(column_names, expected, minimized, samples)
+    transform_ncp = minimizer.ncp.transform_score
+    check_ncp(transform_ncp, expected)
+
+    achieved = classifier.score(minimized, predicted)
+    assert achieved >= accuracy_goal or accuracy_goal - achieved <= ACCURACY_DIFF
+
+
+def test_anonymize_pandas_one_hot():
+    columns = ["age", "gender_M", "gender_F", "height"]
+    raw = np.array([[23, 0, 1, 165],
+                    [45, 0, 1, 158],
+                    [56, 1, 0, 123],
+                    [67, 0, 1, 154],
+                    [45, 1, 0, 149],
+                    [42, 1, 0, 166],
+                    [73, 0, 1, 172],
+                    [94, 0, 1, 168],
+                    [69, 0, 1, 175],
+                    [24, 1, 0, 181],
+                    [18, 1, 0, 190]])
+    labels = pd.Series(np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]))
+    frame = pd.DataFrame(raw, columns=columns)
+
+    classifier = DecisionTreeClassifier()
+    classifier.fit(frame, labels)
+    predicted = classifier.predict(frame)
+
+    # NOTE(review): Anonymize is not touched by this patch - confirm it accepts quasi_identifer_slices
+    k = 10
+    qi = ["age", "gender_M", "gender_F"]
+    slices = [["gender_M", "gender_F"]]
+    anonymizer = Anonymize(k, qi, train_only_QI=True, quasi_identifer_slices=slices)
+    anon = anonymizer.anonymize(ArrayDataset(frame, predicted))
+    assert anon.loc[:, qi].drop_duplicates().shape[0] < frame.loc[:, qi].drop_duplicates().shape[0]
+    assert anon.loc[:, qi].value_counts().min() >= k
+    np.testing.assert_array_equal(anon.drop(qi, axis=1), frame.drop(qi, axis=1))
+    one_hot = anon.loc[:, slices[0]]
+    assert (np.sum(one_hot, axis=1) == 1).all()
+    assert (np.max(one_hot, axis=1) == 1).all()
+    assert (np.min(one_hot, axis=1) == 0).all()
+
+
 def test_keras_model():
     (x, y), (x_test, y_test) = get_iris_dataset_np()