From a814404534bf9c6377eacc595971713fdbf86658 Mon Sep 17 00:00:00 2001
From: abigailt <abigailt@il.ibm.com>
Date: Thu, 21 Sep 2023 19:00:27 +0300
Subject: [PATCH] Support 1-hot encoded features in anonymization (#72)

Signed-off-by: abigailt <abigailt@il.ibm.com>
---
 apt/anonymization/anonymizer.py | 76 ++++++++++++++++++++++++---------
 tests/test_anonymizer.py        | 29 +++++++++++++
 2 files changed, 84 insertions(+), 21 deletions(-)

diff --git a/apt/anonymization/anonymizer.py b/apt/anonymization/anonymizer.py
index cd7f097..e254eea 100644
--- a/apt/anonymization/anonymizer.py
+++ b/apt/anonymization/anonymizer.py
@@ -23,7 +23,11 @@ class Anonymize:
     :type k: int
     :param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
                               in case of numpy data.
-    :type quasi_identifiers: np.ndarray or list
+    :type quasi_identifiers: np.ndarray or list of strings or integers.
+    :param quasi_identifer_slices: If some of the quasi-identifiers represent 1-hot encoded features that need to remain
+                                   consistent after anonymization, provide a list containing the list of column names
+                                   or indexes that represent a single feature.
+    :type quasi_identifer_slices: list of lists of strings or integers.
     :param categorical_features: The list of categorical features (if supplied, these featurtes will be one-hot encoded
                                  before using them to train the decision tree model).
     :type categorical_features: list, optional
@@ -35,8 +39,12 @@ class Anonymize:
     :type train_only_QI: boolean, optional
     """
 
-    def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
-                 is_regression: Optional[bool] = False, train_only_QI: Optional[bool] = False):
+    def __init__(self, k: int,
+                 quasi_identifiers: Union[np.ndarray, list],
+                 quasi_identifer_slices: Optional[list[list]] = None,
+                 categorical_features: Optional[list] = None,
+                 is_regression: Optional[bool] = False,
+                 train_only_QI: Optional[bool] = False):
         if k < 2:
             raise ValueError("k should be a positive integer with a value of 2 or higher")
         if quasi_identifiers is None or len(quasi_identifiers) < 1:
@@ -49,6 +57,7 @@ class Anonymize:
         self.train_only_QI = train_only_QI
         self.features_names = None
         self.features = None
+        self.quasi_identifer_slices = quasi_identifer_slices
 
     def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
         """
@@ -76,7 +85,14 @@ class Anonymize:
         if self.categorical_features and not set(self.categorical_features).issubset(set(self.features_names)):
             raise ValueError('Categorical features should bs a subset of the supplied features or indexes in range of '
                              'the data columns')
+        # transform quasi identifiers to indexes
         self.quasi_identifiers = [i for i, v in enumerate(self.features_names) if v in self.quasi_identifiers]
+        if self.quasi_identifer_slices:
+            temp_list = []
+            for slice in self.quasi_identifer_slices:
+                new_slice = [i for i, v in enumerate(self.features_names) if v in slice]
+                temp_list.append(new_slice)
+            self.quasi_identifer_slices = temp_list
         if self.categorical_features:
             self.categorical_features = [i for i, v in enumerate(self.features_names) if v in self.categorical_features]
 
@@ -126,31 +142,49 @@ class Anonymize:
         return cells_by_id
 
     def _find_representatives(self, x, x_anonymizer_train, cells):
-        # x is original data, x_anonymizer_train is only QIs + 1-hot encoded
+        # x is original data (always numpy), x_anonymizer_train is only QIs + 1-hot encoded
         node_ids = self._find_sample_nodes(x_anonymizer_train)
+        if self.quasi_identifer_slices:
+            all_one_hot_features = set([feature for encoded in self.quasi_identifer_slices for feature in encoded])
+        else:
+            all_one_hot_features = set()
         for cell in cells:
             cell['representative'] = {}
             # get all rows in cell
             indexes = [index for index, node_id in enumerate(node_ids) if node_id == cell['id']]
             # TODO: should we filter only those with majority label? (using hist)
             rows = x[indexes]
-            for feature in self.quasi_identifiers:
-                values = rows[:, feature]
-                if self.categorical_features and feature in self.categorical_features:
-                    # find most common value
-                    cell['representative'][feature] = Counter(values).most_common(1)[0][0]
-                else:
-                    # find the mean value (per feature)
-                    median = np.median(values)
-                    min_value = max(values)
-                    min_dist = float("inf")
-                    for value in values:
-                        # euclidean distance between two floating point values
-                        dist = abs(value - median)
-                        if dist < min_dist:
-                            min_dist = dist
-                            min_value = value
-                    cell['representative'][feature] = min_value
+            done = set()
+            for feature in self.quasi_identifiers:  # self.quasi_identifiers are numerical indexes
+                if feature not in done:
+                    # deal with 1-hot encoded features
+                    if feature in all_one_hot_features:
+                        # find features that belong together
+                        for encoded in self.quasi_identifer_slices:
+                            if feature in encoded:
+                                values = rows[:, encoded]
+                                unique_rows, counts = np.unique(values, axis=0, return_counts=True)
+                                rep = unique_rows[np.argmax(counts)]
+                                for i, e in enumerate(encoded):
+                                    done.add(e)
+                                    cell['representative'][e] = rep[i]
+                    else:  # rest of features
+                        values = rows[:, feature]
+                        if self.categorical_features and feature in self.categorical_features:
+                            # find most common value
+                            cell['representative'][feature] = Counter(values).most_common(1)[0][0]
+                        else:
+                            # find the mean value (per feature)
+                            median = np.median(values)
+                            min_value = max(values)
+                            min_dist = float("inf")
+                            for value in values:
+                                # euclidean distance between two floating point values
+                                dist = abs(value - median)
+                                if dist < min_dist:
+                                    min_dist = dist
+                                    min_value = value
+                            cell['representative'][feature] = min_value
 
     def _find_sample_nodes(self, samples):
         paths = self._anonymizer.decision_path(samples).toarray()
diff --git a/tests/test_anonymizer.py b/tests/test_anonymizer.py
index 22dc36c..633a7b0 100644
--- a/tests/test_anonymizer.py
+++ b/tests/test_anonymizer.py
@@ -118,6 +118,35 @@ def test_regression():
     assert ((np.delete(anon, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
 
 
+def test_anonymize_ndarray_one_hot():
+    x_train = np.array([[23, 0, 1, 165],
+                          [45, 0, 1, 158],
+                          [56, 1, 0, 123],
+                          [67, 0, 1, 154],
+                          [45, 1, 0, 149],
+                          [42, 1, 0, 166],
+                          [73, 0, 1, 172],
+                          [94, 0, 1, 168],
+                          [69, 0, 1, 175],
+                          [24, 1, 0, 181],
+                          [18, 1, 0, 190]])
+    y_train = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+
+    model = DecisionTreeClassifier()
+    model.fit(x_train, y_train)
+    pred = model.predict(x_train)
+
+    k = 10
+    QI = [0, 1, 2]
+    QI_slices = [[1, 2]]
+    anonymizer = Anonymize(k, QI, train_only_QI=True, quasi_identifer_slices=QI_slices)
+    anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
+    assert (len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0)))
+    _, counts_elements = np.unique(anon[:, QI], return_counts=True)
+    assert (np.min(counts_elements) >= k)
+    assert ((np.delete(anon, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
+
+
 def test_errors():
     with pytest.raises(ValueError):
         Anonymize(1, [0, 2])