Support for one-hot encoded features in minimization (#87)

* Initial version with first working test
* Make sure representative values in generalizations for 1-hot encoded features are consistent.
* Updated notebooks for one-hot encoded data
* Review comments

Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
abigailgold 2023-12-24 18:18:18 -05:00 committed by GitHub
parent 5dce961092
commit 6d81cd8ed4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 26703 additions and 48 deletions

View file

@ -56,9 +56,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
:param encoder: Optional encoder for encoding data before feeding it into the estimator (e.g., for categorical
features). If not provided, the data will be fed as is directly to the estimator.
:type encoder: sklearn OrdinalEncoder or OneHotEncoder
:type categorical_features: list of strings, optional
:param features_to_minimize: The features to be minimized.
:type categorical_features: list of strings or integers, optional
:param features_to_minimize: The features to be minimized. If not provided, all features will be minimized.
:type features_to_minimize: list of strings or int, optional
:param feature_slices: If some of the features to be minimized represent 1-hot encoded features that need to remain
consistent after minimization, provide a list containing the list of column names
or indexes that represent a single feature.
:type feature_slices: list of lists of strings or integers, optional
:param train_only_features_to_minimize: Whether to train the tree just on the ``features_to_minimize`` or on all
features. Default is only on ``features_to_minimize``.
:type train_only_features_to_minimize: boolean, optional
@ -79,6 +83,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
categorical_features: Optional[Union[np.ndarray, list]] = None,
encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None,
features_to_minimize: Optional[Union[np.ndarray, list]] = None,
feature_slices: Optional[list] = None,
train_only_features_to_minimize: Optional[bool] = True,
is_regression: Optional[bool] = False,
generalize_using_transform: bool = True):
@ -91,12 +96,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_PROBABILITIES)
self.target_accuracy = target_accuracy
self.cells = cells
if cells:
self._calculate_generalizations()
self.categorical_features = []
if categorical_features:
self.categorical_features = categorical_features
self.features_to_minimize = features_to_minimize
self.feature_slices = feature_slices
if self.feature_slices:
self.all_one_hot_features = {str(feature) for encoded in self.feature_slices for feature in encoded}
else:
self.all_one_hot_features = set()
self.train_only_features_to_minimize = train_only_features_to_minimize
self.is_regression = is_regression
self.encoder = encoder
@ -107,6 +115,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._dt = None
self._features = None
self._level = 0
if cells:
self._calculate_generalizations()
def get_params(self, deep=True):
"""
@ -121,6 +131,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
ret['target_accuracy'] = self.target_accuracy
ret['categorical_features'] = self.categorical_features
ret['features_to_minimize'] = self.features_to_minimize
ret['feature_slices'] = self.feature_slices
ret['train_only_features_to_minimize'] = self.train_only_features_to_minimize
ret['is_regression'] = self.is_regression
ret['estimator'] = self.estimator
@ -151,6 +162,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.categorical_features = params['categorical_features']
if 'features_to_minimize' in params:
self.features_to_minimize = params['features_to_minimize']
if 'feature_slices' in params:
self.feature_slices = params['feature_slices']
if 'train_only_features_to_minimize' in params:
self.train_only_features_to_minimize = params['train_only_features_to_minimize']
if 'is_regression' in params:
@ -259,6 +272,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.features_to_minimize = [str(i) for i in self.features_to_minimize]
if not all(elem in self._features for elem in self.features_to_minimize):
raise ValueError('features to minimize should be a subset of features names')
if self.feature_slices:
temp_list = []
for slice in self.feature_slices:
new_slice = [str(i) for i in slice]
if not all(elem in self._features for elem in new_slice):
raise ValueError('features in slices should be a subset of features names')
temp_list.append(new_slice)
self.feature_slices = temp_list
x_qi = x.loc[:, self.features_to_minimize]
# divide dataset into train and test
@ -325,8 +346,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# if accuracy above threshold, improve generalization
if accuracy > self.target_accuracy:
print('Improving generalizations')
self._level = 1
self._level = 0
while accuracy > self.target_accuracy:
self._level += 1
cells_previous_iter = self.cells
generalization_prev_iter = self._generalizations
cells_by_id_prev = self._cells_by_id
@ -352,7 +374,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
break
else:
print('Pruned tree to level: %d, new relative accuracy: %f' % (self._level, accuracy))
self._level += 1
# if accuracy below threshold, improve accuracy by removing features from generalization
elif accuracy < self.target_accuracy:
@ -375,6 +396,16 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
x_test_dataset = ArrayDataset(x_test, features_names=self._features)
self._ncp_scores.fit_score = self.calculate_ncp(x_test_dataset)
self._ncp_scores.generalizations_score = self.calculate_ncp(x_test_dataset)
else:
print('No fitting was performed as some information was missing')
if not self.estimator:
print('No estimator provided')
elif not dataset:
print('No data provided')
elif dataset.get_samples() is None:
print('No samples provided')
elif dataset.get_labels() is None:
print('No labels provided')
# Return the transformer
return self
@ -579,7 +610,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
elif feature in cell['untouched']:
continue
else:
raise TypeError("feature " + feature + "not found in cell" + cell['id'])
raise TypeError("feature " + str(feature) + " not found in cell " + str(cell['id']))
# Mark as mapped
mapped.itemset(index, 1)
return True
@ -703,6 +734,32 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# categorical feature can not have this value
if categorical_value in new_cell['categories'][categorical_feature]:
new_cell['categories'][categorical_feature].remove(categorical_value)
# features that were already one-hot encoded. Legal values should be 0 or 1
elif feature in self.all_one_hot_features:
if feature not in new_cell['categories'].keys():
new_cell['categories'][feature] = []
if feature in cell['ranges']:
range = cell['ranges'][feature]
if range['start'] is None and range['end'] < 1:
feature_value = 0
elif range['end'] is None and range['start'] > 0:
feature_value = 1
else:
raise ValueError('Illegal range for 1-hot encoded feature')
new_cell['categories'][feature] = [feature_value]
# need to add other columns that represent same 1-hot encoded feature
# search for feature group:
other_features, encoded = self._get_other_features_in_encoding(feature, self.feature_slices)
for other_feature in other_features:
if feature_value == 1:
new_cell['categories'][other_feature] = [0]
elif len(encoded) == 2:
new_cell['categories'][other_feature] = [1]
elif (other_feature not in new_cell['categories'].keys()
or len(new_cell['categories'][other_feature]) == 0):
new_cell['categories'][other_feature] = [0, 1]
else:
if feature in cell['ranges'].keys():
new_cell['ranges'][feature] = cell['ranges'][feature]
@ -813,6 +870,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
min_dist = dist
min = i
i = i + 1
# since this is an actual row from the data, correct one-hot encoding is already guaranteed
row = match_rows.iloc[min]
for feature in cell['ranges'].keys():
cell['representative'][feature] = row[feature]
@ -861,6 +919,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
new_dtypes = {}
for t in dtypes.keys():
new_dtypes[t] = pd.Series(dtype=dtypes[t].name)
dtypes[t] = dtypes[t].name
representatives = pd.DataFrame(new_dtypes) # empty except for columns
original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)
@ -891,6 +950,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
replace = pd.DataFrame(replace, indexes, columns=self._features)
original_data_generalized.loc[indexes, representatives.columns.tolist()] = replace
original_data_generalized = original_data_generalized.astype(dtype=dtypes)
return original_data_generalized
def _generalize(self, data, data_prepared, nodes):
@ -1024,7 +1084,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
current_accuracy):
new_cells = copy.deepcopy(self.cells)
cells_by_id = copy.deepcopy(self._cells_by_id)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
self._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
cells_by_id)
accuracy = self._calculate_accuracy(generalized, labels, self.estimator, self.encoder)
@ -1050,17 +1110,31 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# categorical - use most common value
old_category_representatives = category_representatives
category_representatives = {}
done = set()
for feature in self._generalizations['categories']:
category_representatives[feature] = []
for g_index, group in enumerate(self._generalizations['categories'][feature]):
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == g_index]
if indexes:
rows = samples.iloc[indexes]
values = rows[feature]
category = Counter(values).most_common(1)[0][0]
category_representatives[feature].append(category)
else:
category_representatives[feature].append(old_category_representatives[feature][g_index])
if feature not in done:
category_representatives[feature] = []
for g_index, group in enumerate(self._generalizations['categories'][feature]):
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == g_index]
if indexes:
rows = samples.iloc[indexes]
if feature in self.all_one_hot_features:
other_features, encoded = self._get_other_features_in_encoding(feature,
self.feature_slices)
values = rows.loc[:, encoded].to_numpy()
unique_rows, counts = np.unique(values, axis=0, return_counts=True)
rep = unique_rows[np.argmax(counts)]
for i, e in enumerate(encoded):
done.add(e)
if e not in category_representatives.keys():
category_representatives[e] = []
category_representatives[e].append(rep[i])
else:
values = rows[feature]
category = Counter(values).most_common(1)[0][0]
category_representatives[feature].append(category)
else:
category_representatives[feature].append(old_category_representatives[feature][g_index])
# numerical - use actual value closest to mean
old_range_representatives = range_representatives
@ -1169,35 +1243,55 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
range_representatives[feature].append(prev_value + 1)
return ranges, range_representatives
@staticmethod
def _calculate_categories(cells):
def _calculate_categories(self, cells):
categories = {}
category_representatives = {}
categorical_features_values = GeneralizeToRepresentative._calculate_categorical_features_values(cells)
assigned_features = set()
for feature in categorical_features_values.keys():
partitions = []
category_representatives[feature] = []
values = categorical_features_values[feature]
assigned = []
assigned_values = set()
for i in range(len(values)):
value1 = values[i]
if value1 in assigned:
if value1 in assigned_values:
continue
partition = [value1]
assigned.append(value1)
assigned_values.add(value1)
for j in range(len(values)):
if j <= i:
continue
value2 = values[j]
if GeneralizeToRepresentative._are_inseparable(cells, feature, value1, value2):
partition.append(value2)
assigned.append(value2)
assigned_values.add(value2)
partitions.append(partition)
# default representative values (computed with no data)
category_representatives[feature].append(partition[0]) # random
# for 1-hot encoded features, the first encountered feature will get the value 1 and the rest 0
if len(partition) > 1 and feature in self.all_one_hot_features:
other_features, _ = self._get_other_features_in_encoding(feature, self.feature_slices)
assigned = False
for other_feature in other_features:
if other_feature in assigned_features:
category_representatives[feature].append(0)
assigned = True
break
if not assigned:
category_representatives[feature].append(1)
assigned_features.add(feature)
else:
category_representatives[feature].append(partition[0]) # random
categories[feature] = partitions
return categories, category_representatives
@staticmethod
def _get_other_features_in_encoding(feature, feature_slices):
for encoded in feature_slices:
if feature in encoded:
return (list(set(encoded) - {feature})), encoded
return [], []
@staticmethod
def _calculate_categorical_features_values(cells):
categorical_features_values = {}
@ -1229,16 +1323,25 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
untouched = untouched.intersection(*untouched_lists)
return list(untouched)
def _remove_feature_from_cells(self, cells, cells_by_id, feature):
    """Remove ``feature`` from the generalization cells.

    If ``feature`` is a column of a 1-hot encoding group, every column in
    that group is removed together so the encoded feature stays consistent
    after minimization; otherwise only the single column is removed.
    """
    if feature not in self.all_one_hot_features:
        # Plain feature: remove it on its own.
        self._remove_feature_from_cells_internal(cells, cells_by_id, [feature])
        return
    # 1-hot encoded feature: remove the whole encoding group it belongs to.
    for group in self.feature_slices:
        if feature in group:
            self._remove_feature_from_cells_internal(cells, cells_by_id, group)
@staticmethod
def _remove_feature_from_cells(cells, cells_by_id, feature):
def _remove_feature_from_cells_internal(cells, cells_by_id, features):
for cell in cells:
if 'untouched' not in cell:
cell['untouched'] = []
if feature in cell['ranges'].keys():
del cell['ranges'][feature]
elif feature in cell['categories'].keys():
del cell['categories'][feature]
cell['untouched'].append(feature)
for feature in features:
if feature in cell['ranges'].keys():
del cell['ranges'][feature]
elif feature in cell['categories'].keys():
del cell['categories'][feature]
cell['untouched'].append(feature)
cells_by_id[cell['id']] = cell.copy()
@staticmethod