Support 1-hot encoded features in anonymization + fixes related to encoding in minimization (#86)

* Support 1-hot encoded features in anonymization (#72)
* Fix anonymization adult notebook + new notebook to demonstrate anonymization on 1-hot encoded data
* Minimizer: No default encoder; if none is provided, data is supplied to the model as is. Fix data type of representative values. Fix and add more tests.

Signed-off-by: abigailt <abigailt@il.ibm.com>
2026-06-29 15:59:38 +02:00 · 2023-10-19 11:48:15 +03:00 · 2023-10-19 11:48:15 +03:00 · 5dce961092
commit 5dce961092
parent 26addd192f
7 changed files with 670 additions and 255 deletions
--- a/apt/anonymization/anonymizer.py
+++ b/apt/anonymization/anonymizer.py
@ -23,7 +23,11 @@ class Anonymize:
    :type k: int
    :param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
                              in case of numpy data.
-    :type quasi_identifiers: np.ndarray or list
+    :type quasi_identifiers: np.ndarray or list of strings or integers.
+    :param quasi_identifer_slices: If some of the quasi-identifiers represent 1-hot encoded features that need to remain
+                                   consistent after anonymization, provide a list containing the list of column names
+                                   or indexes that represent a single feature.
+    :type quasi_identifer_slices: list of lists of strings or integers.
    :param categorical_features: The list of categorical features (if supplied, these features will be one-hot encoded
                                 before using them to train the decision tree model).
    :type categorical_features: list, optional
@ -35,8 +39,12 @@ class Anonymize:
    :type train_only_QI: boolean, optional
    """

-    def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
-                 is_regression: Optional[bool] = False, train_only_QI: Optional[bool] = False):
+    def __init__(self, k: int,
+                 quasi_identifiers: Union[np.ndarray, list],
+                 quasi_identifer_slices: Optional[list] = None,
+                 categorical_features: Optional[list] = None,
+                 is_regression: Optional[bool] = False,
+                 train_only_QI: Optional[bool] = False):
        if k < 2:
            raise ValueError("k should be a positive integer with a value of 2 or higher")
        if quasi_identifiers is None or len(quasi_identifiers) < 1:
@ -49,6 +57,7 @@ class Anonymize:
        self.train_only_QI = train_only_QI
        self.features_names = None
        self.features = None
+        self.quasi_identifer_slices = quasi_identifer_slices

    def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
        """
@ -76,7 +85,14 @@ class Anonymize:
        if self.categorical_features and not set(self.categorical_features).issubset(set(self.features_names)):
            raise ValueError('Categorical features should bs a subset of the supplied features or indexes in range of '
                             'the data columns')
+        # transform quasi identifiers to indexes
        self.quasi_identifiers = [i for i, v in enumerate(self.features_names) if v in self.quasi_identifiers]
+        if self.quasi_identifer_slices:
+            temp_list = []
+            for slice in self.quasi_identifer_slices:
+                new_slice = [i for i, v in enumerate(self.features_names) if v in slice]
+                temp_list.append(new_slice)
+            self.quasi_identifer_slices = temp_list
        if self.categorical_features:
            self.categorical_features = [i for i, v in enumerate(self.features_names) if v in self.categorical_features]

@ -126,31 +142,49 @@ class Anonymize:
        return cells_by_id

    def _find_representatives(self, x, x_anonymizer_train, cells):
-        # x is original data, x_anonymizer_train is only QIs + 1-hot encoded
+        # x is original data (always numpy), x_anonymizer_train is only QIs + 1-hot encoded
        node_ids = self._find_sample_nodes(x_anonymizer_train)
+        if self.quasi_identifer_slices:
+            all_one_hot_features = set([feature for encoded in self.quasi_identifer_slices for feature in encoded])
+        else:
+            all_one_hot_features = set()
        for cell in cells:
            cell['representative'] = {}
            # get all rows in cell
            indexes = [index for index, node_id in enumerate(node_ids) if node_id == cell['id']]
            # TODO: should we filter only those with majority label? (using hist)
            rows = x[indexes]
-            for feature in self.quasi_identifiers:
-                values = rows[:, feature]
-                if self.categorical_features and feature in self.categorical_features:
-                    # find most common value
-                    cell['representative'][feature] = Counter(values).most_common(1)[0][0]
-                else:
-                    # find the mean value (per feature)
-                    median = np.median(values)
-                    min_value = max(values)
-                    min_dist = float("inf")
-                    for value in values:
-                        # euclidean distance between two floating point values
-                        dist = abs(value - median)
-                        if dist < min_dist:
-                            min_dist = dist
-                            min_value = value
-                    cell['representative'][feature] = min_value
+            done = set()
+            for feature in self.quasi_identifiers:  # self.quasi_identifiers are numerical indexes
+                if feature not in done:
+                    # deal with 1-hot encoded features
+                    if feature in all_one_hot_features:
+                        # find features that belong together
+                        for encoded in self.quasi_identifer_slices:
+                            if feature in encoded:
+                                values = rows[:, encoded]
+                                unique_rows, counts = np.unique(values, axis=0, return_counts=True)
+                                rep = unique_rows[np.argmax(counts)]
+                                for i, e in enumerate(encoded):
+                                    done.add(e)
+                                    cell['representative'][e] = rep[i]
+                    else:  # rest of features
+                        values = rows[:, feature]
+                        if self.categorical_features and feature in self.categorical_features:
+                            # find most common value
+                            cell['representative'][feature] = Counter(values).most_common(1)[0][0]
+                        else:
+                            # find the actual sample value closest to the median (per feature)
+                            median = np.median(values)
+                            min_value = max(values)
+                            min_dist = float("inf")
+                            for value in values:
+                                # euclidean distance between two floating point values
+                                dist = abs(value - median)
+                                if dist < min_dist:
+                                    min_dist = dist
+                                    min_value = value
+                            cell['representative'][feature] = min_value

    def _find_sample_nodes(self, samples):
        paths = self._anonymizer.decision_path(samples).toarray()
--- a/apt/minimization/minimizer.py
+++ b/apt/minimization/minimizer.py
@ -10,9 +10,6 @@ import copy
 import sys
 from scipy.spatial import distance
 from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin
-from sklearn.compose import ColumnTransformer
-from sklearn.impute import SimpleImputer
-from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
 from sklearn.utils.validation import check_is_fitted
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
@ -57,7 +54,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
    :param categorical_features: The list of categorical features (if supplied, these features will be one-hot
                                 encoded before using them to train the decision tree model).
    :param encoder: Optional encoder for encoding data before feeding it into the estimator (e.g., for categorical
-                    features)
+                    features). If not provided, the data will be fed as is directly to the estimator.
    :type encoder: sklearn OrdinalEncoder or OneHotEncoder
    :type categorical_features: list of strings, optional
    :param features_to_minimize: The features to be minimized.
@ -256,7 +253,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        # Going to fit
        # (currently not dealing with option to fit with only X and y and no estimator)
        if self.estimator and dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
-            dtype = dataset.get_samples().dtype
            x = pd.DataFrame(dataset.get_samples(), columns=self._features)
            if not self.features_to_minimize:
                self.features_to_minimize = self._features
@ -293,21 +289,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
            # collect feature data (such as min, max)
            self._feature_data = self._get_feature_data(x)

-            # default encoder in case none provided
-            if self.encoder is None:
-                numeric_features = [f for f in self._features if f not in self.categorical_features]
-                numeric_transformer = Pipeline(
-                    steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
-                )
-                categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
-                self.encoder = ColumnTransformer(
-                    transformers=[
-                        ("num", numeric_transformer, numeric_features),
-                        ("cat", categorical_transformer, self.categorical_features),
-                    ]
-                )
-                self.encoder.fit(x)
-
            self.cells = []
            self._categorical_values = {}

@ -334,14 +315,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
            self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)

            # self._cells currently holds the generalization created from the tree leaves
-            self._calculate_generalizations(x_test)
-            if self.generalize_using_transform:
-                generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
-            else:
-                generalized = self._generalize_from_generalizations(x_test, self.generalizations)
+            generalized = self._generalize(x_test, x_prepared_test, nodes)

            # check accuracy
-            accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype), y_test))
+            accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
            print('Initial accuracy of model on generalized data, relative to original model predictions '
                  '(base generalization derived from tree, before improvements): %f' % accuracy)

@ -364,15 +341,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM

                    self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)

-                    self._calculate_generalizations(x_test)
-                    if self.generalize_using_transform:
-                        generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
-                                                                 self._cells_by_id)
-                    else:
-                        generalized = self._generalize_from_generalizations(x_test, self.generalizations)
-
-                    accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype),
-                                                                 y_test))
+                    generalized = self._generalize(x_test, x_prepared_test, nodes)
+                    accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
                    # if accuracy dropped below the target threshold, roll back to previous iteration generalizations
                    if accuracy < self.target_accuracy:
                        self.cells = cells_previous_iter
@ -395,14 +365,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                    if removed_feature is None:
                        break

-                    self._calculate_generalizations(x_test)
-                    if self.generalize_using_transform:
-                        generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
-                                                                 self._cells_by_id)
-                    else:
-                        generalized = self._generalize_from_generalizations(x_test, self.generalizations)
-                    accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype),
-                                                                 y_test))
+                    generalized = self._generalize(x_test, x_prepared_test, nodes)
+                    accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
                    print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))

            # self._cells currently holds the chosen generalization based on target accuracy
@ -893,7 +857,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM

    def _generalize_indexes(self, original_data, cells, all_indexes):
        # prepared data include one hot encoded categorical data + QI
-        representatives = pd.DataFrame(columns=self._features)  # empty except for columns
+        dtypes = original_data.dtypes.to_dict()
+        new_dtypes = {}
+        for t in dtypes.keys():
+            new_dtypes[t] = pd.Series(dtype=dtypes[t].name)
+        representatives = pd.DataFrame(new_dtypes)  # empty except for columns
        original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)

        # iterate over cells (leaves in decision tree)
@ -925,6 +893,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM

        return original_data_generalized

+    def _generalize(self, data, data_prepared, nodes):
+        self._calculate_generalizations(data)
+        if self.generalize_using_transform:
+            generalized = self._generalize_from_tree(data, data_prepared, nodes, self.cells,
+                                                     self._cells_by_id)
+        else:
+            generalized = self._generalize_from_generalizations(data, self.generalizations)
+        return generalized
+
    @staticmethod
    def _map_to_ranges_categories(samples, ranges, categories):
        all_sample_indexes = []
@ -994,18 +971,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                                                         feature_data[feature],
                                                         total)
                if feature_ncp > 0:
-                    # divide by accuracy gain
-                    new_cells = copy.deepcopy(self.cells)
-                    cells_by_id = copy.deepcopy(self._cells_by_id)
-                    GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
-                    generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
-                                                             cells_by_id)
-                    accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
-                                                                      labels)) - current_accuracy
-                    if accuracy_gain < 0:
-                        accuracy_gain = 0
-                    if accuracy_gain != 0:
-                        feature_ncp = feature_ncp / accuracy_gain
+                    feature_ncp = self._normalize_ncp_by_accuracy_gain(original_data, prepared_data, nodes, feature,
+                                                                       feature_ncp, labels, current_accuracy)

                if feature_ncp < range_min:
                    range_min = feature_ncp
@ -1021,19 +988,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                                                             feature_data[feature],
                                                             total)
                if feature_ncp > 0:
-                    # divide by accuracy loss
-                    new_cells = copy.deepcopy(self.cells)
-                    cells_by_id = copy.deepcopy(self._cells_by_id)
-                    GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
-                    generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
-                                                             cells_by_id)
-                    accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
-                                                                      labels)) - current_accuracy
+                    feature_ncp = self._normalize_ncp_by_accuracy_gain(original_data, prepared_data, nodes, feature,
+                                                                       feature_ncp, labels, current_accuracy)

-                    if accuracy_gain < 0:
-                        accuracy_gain = 0
-                    if accuracy_gain != 0:
-                        feature_ncp = feature_ncp / accuracy_gain
                if feature_ncp < range_min:
                    range_min = feature_ncp
                    remove_feature = feature
@ -1063,6 +1020,21 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
            feature_ncp += cell_ncp
        return feature_ncp

+    def _normalize_ncp_by_accuracy_gain(self, original_data, prepared_data, nodes, feature, feature_ncp, labels,
+                                        current_accuracy):
+        new_cells = copy.deepcopy(self.cells)
+        cells_by_id = copy.deepcopy(self._cells_by_id)
+        GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
+        generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
+                                                 cells_by_id)
+        accuracy = self._calculate_accuracy(generalized, labels, self.estimator, self.encoder)
+        accuracy_gain = accuracy - current_accuracy
+        if accuracy_gain < 0:
+            accuracy_gain = 0
+        if accuracy_gain != 0:
+            feature_ncp = feature_ncp / accuracy_gain
+        return feature_ncp
+
    def _calculate_generalizations(self, samples: Optional[pd.DataFrame] = None):
        ranges, range_representatives = self._calculate_ranges(self.cells)
        categories, category_representatives = self._calculate_categories(self.cells)
@ -1282,3 +1254,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM

        for feature in to_remove:
            del generalizations['categories'][feature]
+
+    @staticmethod
+    def _calculate_accuracy(generalized, y_test, estimator, encoder):
+        generalized_data = encoder.transform(generalized) if encoder else generalized
+        return estimator.score(ArrayDataset(generalized_data, y_test))
--- a/apt/utils/models/pytorch_model.py
+++ b/apt/utils/models/pytorch_model.py
@ -368,7 +368,7 @@ class PyTorchClassifier(PyTorchModel):
        if validation_data is None:
            self._art_model.fit(
                x=train_data.get_samples(),
-                y=train_data.get_labels().reshape(-1, 1),
+                y=train_data.get_labels(),
                batch_size=batch_size,
                nb_epochs=nb_epochs,
                save_checkpoints=save_checkpoints,
@ -379,9 +379,9 @@ class PyTorchClassifier(PyTorchModel):
        else:
            self._art_model.fit(
                x=train_data.get_samples(),
-                y=train_data.get_labels().reshape(-1, 1),
+                y=train_data.get_labels(),
                x_validation=validation_data.get_samples(),
-                y_validation=validation_data.get_labels().reshape(-1, 1),
+                y_validation=validation_data.get_labels(),
                batch_size=batch_size,
                nb_epochs=nb_epochs,
                save_checkpoints=save_checkpoints,