Support 1-hot encoded features in anonymization (#72)

Signed-off-by: abigailt <abigailt@il.ibm.com>
2026-07-23 17:01:03 +02:00 · 2023-09-21 19:00:27 +03:00 · 2023-09-21 19:00:27 +03:00 · a814404534
commit a814404534
parent 26addd192f
2 changed files with 84 additions and 21 deletions
--- a/apt/anonymization/anonymizer.py
+++ b/apt/anonymization/anonymizer.py
@ -23,7 +23,11 @@ class Anonymize:
    :type k: int
    :param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
                              in case of numpy data.
-    :type quasi_identifiers: np.ndarray or list
+    :type quasi_identifiers: np.ndarray or list of strings or integers.
+    :param quasi_identifer_slices: If some of the quasi-identifiers represent 1-hot encoded features that need to remain
+                                   consistent after anonymization, provide a list of lists, where each inner list
+                                   contains the column names or indexes that together represent a single feature.
+    :type quasi_identifer_slices: list of lists of strings or integers, optional
    :param categorical_features: The list of categorical features (if supplied, these featurtes will be one-hot encoded
                                 before using them to train the decision tree model).
    :type categorical_features: list, optional
@ -35,8 +39,12 @@ class Anonymize:
    :type train_only_QI: boolean, optional
    """

-    def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
-                 is_regression: Optional[bool] = False, train_only_QI: Optional[bool] = False):
+    def __init__(self, k: int,
+                 quasi_identifiers: Union[np.ndarray, list],
+                 quasi_identifer_slices: Optional[list[list]] = None,
+                 categorical_features: Optional[list] = None,
+                 is_regression: Optional[bool] = False,
+                 train_only_QI: Optional[bool] = False):
        if k < 2:
            raise ValueError("k should be a positive integer with a value of 2 or higher")
        if quasi_identifiers is None or len(quasi_identifiers) < 1:
@ -49,6 +57,7 @@ class Anonymize:
        self.train_only_QI = train_only_QI
        self.features_names = None
        self.features = None
+        self.quasi_identifer_slices = quasi_identifer_slices

    def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
        """
@ -76,7 +85,14 @@ class Anonymize:
        if self.categorical_features and not set(self.categorical_features).issubset(set(self.features_names)):
            raise ValueError('Categorical features should bs a subset of the supplied features or indexes in range of '
                             'the data columns')
+        # transform quasi identifiers to indexes
        self.quasi_identifiers = [i for i, v in enumerate(self.features_names) if v in self.quasi_identifiers]
+        if self.quasi_identifer_slices:
+            temp_list = []
+            for slice in self.quasi_identifer_slices:
+                new_slice = [i for i, v in enumerate(self.features_names) if v in slice]
+                temp_list.append(new_slice)
+            self.quasi_identifer_slices = temp_list
        if self.categorical_features:
            self.categorical_features = [i for i, v in enumerate(self.features_names) if v in self.categorical_features]

@ -126,31 +142,49 @@ class Anonymize:
        return cells_by_id

    def _find_representatives(self, x, x_anonymizer_train, cells):
-        # x is original data, x_anonymizer_train is only QIs + 1-hot encoded
+        # x is original data (always numpy), x_anonymizer_train is only QIs + 1-hot encoded
        node_ids = self._find_sample_nodes(x_anonymizer_train)
+        if self.quasi_identifer_slices:
+            all_one_hot_features = set([feature for encoded in self.quasi_identifer_slices for feature in encoded])
+        else:
+            all_one_hot_features = set()
        for cell in cells:
            cell['representative'] = {}
            # get all rows in cell
            indexes = [index for index, node_id in enumerate(node_ids) if node_id == cell['id']]
            # TODO: should we filter only those with majority label? (using hist)
            rows = x[indexes]
-            for feature in self.quasi_identifiers:
-                values = rows[:, feature]
-                if self.categorical_features and feature in self.categorical_features:
-                    # find most common value
-                    cell['representative'][feature] = Counter(values).most_common(1)[0][0]
-                else:
-                    # find the mean value (per feature)
-                    median = np.median(values)
-                    min_value = max(values)
-                    min_dist = float("inf")
-                    for value in values:
-                        # euclidean distance between two floating point values
-                        dist = abs(value - median)
-                        if dist < min_dist:
-                            min_dist = dist
-                            min_value = value
-                    cell['representative'][feature] = min_value
+            done = set()
+            for feature in self.quasi_identifiers:  # self.quasi_identifiers are numerical indexes
+                if feature not in done:
+                    # deal with 1-hot encoded features
+                    if feature in all_one_hot_features:
+                        # find features that belong together
+                        for encoded in self.quasi_identifer_slices:
+                            if feature in encoded:
+                                values = rows[:, encoded]
+                                unique_rows, counts = np.unique(values, axis=0, return_counts=True)
+                                rep = unique_rows[np.argmax(counts)]
+                                for i, e in enumerate(encoded):
+                                    done.add(e)
+                                    cell['representative'][e] = rep[i]
+                    else:  # rest of features
+                        values = rows[:, feature]
+                        if self.categorical_features and feature in self.categorical_features:
+                            # find most common value
+                            cell['representative'][feature] = Counter(values).most_common(1)[0][0]
+                        else:
+                            # find the actual value closest to the median (per feature)
+                            median = np.median(values)
+                            min_value = max(values)
+                            min_dist = float("inf")
+                            for value in values:
+                                # absolute distance between this value and the median
+                                dist = abs(value - median)
+                                if dist < min_dist:
+                                    min_dist = dist
+                                    min_value = value
+                            cell['representative'][feature] = min_value

    def _find_sample_nodes(self, samples):
        paths = self._anonymizer.decision_path(samples).toarray()
--- a/tests/test_anonymizer.py
+++ b/tests/test_anonymizer.py
@ -118,6 +118,35 @@ def test_regression():
    assert ((np.delete(anon, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())


+def test_anonymize_ndarray_one_hot():
+    x_train = np.array([[23, 0, 1, 165],
+                          [45, 0, 1, 158],
+                          [56, 1, 0, 123],
+                          [67, 0, 1, 154],
+                          [45, 1, 0, 149],
+                          [42, 1, 0, 166],
+                          [73, 0, 1, 172],
+                          [94, 0, 1, 168],
+                          [69, 0, 1, 175],
+                          [24, 1, 0, 181],
+                          [18, 1, 0, 190]])
+    y_train = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+
+    model = DecisionTreeClassifier()
+    model.fit(x_train, y_train)
+    pred = model.predict(x_train)
+
+    k = 10
+    QI = [0, 1, 2]
+    QI_slices = [[1, 2]]
+    anonymizer = Anonymize(k, QI, train_only_QI=True, quasi_identifer_slices=QI_slices)
+    anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
+    assert (len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0)))
+    _, counts_elements = np.unique(anon[:, QI], return_counts=True)
+    assert (np.min(counts_elements) >= k)
+    assert ((np.delete(anon, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
+
+
 def test_errors():
    with pytest.raises(ValueError):
        Anonymize(1, [0, 2])