Data and Model wrappers (#26)

* Squashed commit of wrappers:
  - Wrapper minimizer:
    - apply dataset wrapper on minimizer
    - apply changes on minimization notebook
    - add black_box_access and unlimited_queries params
  - Dataset wrapper anonymizer
  - Add features_names to ArrayDataset and allow providing feature names, not just indexes, in QI and categorical features
  - Update notebooks
  - Categorical features and QI passed by indexes
  - Dataset includes feature names and is_pandas param
  - Add PyTorch Dataset
  - Remove redundant code
  - Use data wrappers in model wrapper APIs
  - Add generic dataset components
  - Create initial version of wrappers for models
* Fix handling of categorical features
2026-05-02 16:22:37 +02:00 · 2022-04-27 12:33:27 +03:00 · 2022-04-27 12:33:27 +03:00 · 2b2dab6bef
commit 2b2dab6bef
parent d53818644e
17 changed files with 1340 additions and 752 deletions
--- a/apt/anonymization/anonymizer.py
+++ b/apt/anonymization/anonymizer.py
@ -8,6 +8,7 @@ from sklearn.impute import SimpleImputer
 from sklearn.pipeline import Pipeline
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.preprocessing import OneHotEncoder
+from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE

 from typing import Union, Optional

@ -49,61 +50,64 @@ class Anonymize:
        self.categorical_features = categorical_features
        self.is_regression = is_regression
        self.train_only_QI = train_only_QI
+        self.features_names = None
+        self.features = None

-    def anonymize(self, x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame]) \
-            -> Union[np.ndarray, pd.DataFrame]:
+    def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
        """
        Method for performing model-guided anonymization.

-        :param x: The training data for the model. If provided as a pandas dataframe, may contain both numeric and
-                  categorical data.
-        :param y: The predictions of the original model on the training data.
+        :param dataset: Data wrapper containing the training data for the model and the predictions of the
+                        original model on the training data.
        :return: An array containing the anonymized training dataset.
        """
-        if type(x) == np.ndarray:
-            self.features = [i for i in range(x.shape[1])]
-            return self._anonymize_ndarray(x.copy(), y)
-        else:  # pandas
-            self.features = x.columns
-            if not self.categorical_features:
-                raise ValueError('When supplying a pandas dataframe, categorical_features must be defined')
-            return self._anonymize_pandas(x.copy(), y)
+        if dataset.get_samples().shape[1] != 0:
+            self.features = [i for i in range(dataset.get_samples().shape[1])]
+        else:
+            raise ValueError('No data provided')

-    def _anonymize_ndarray(self, x, y):
+        if dataset.features_names is not None:
+            self.features_names = dataset.features_names
+        else: # if no names provided, use numbers instead
+            self.features_names = self.features
+
+        if not set(self.quasi_identifiers).issubset(set(self.features_names)):
+            raise ValueError('Quasi identifiers should be a subset of the supplied features or indexes in range of '
+                             'the data columns')
+        if self.categorical_features and not set(self.categorical_features).issubset(set(self.features_names)):
+            raise ValueError('Categorical features should be a subset of the supplied features or indexes in range of '
+                             'the data columns')
+        self.quasi_identifiers = [i for i, v in enumerate(self.features_names) if v in self.quasi_identifiers]
+        if self.categorical_features:
+            self.categorical_features = [i for i, v in enumerate(self.features_names) if v in self.categorical_features]
+
+        transformed = self._anonymize(dataset.get_samples().copy(), dataset.get_labels())
+        if dataset.is_pandas:
+            return pd.DataFrame(transformed, columns=self.features_names)
+        else:
+            return transformed
+
+    def _anonymize(self, x, y):
        if x.shape[0] != y.shape[0]:
            raise ValueError("x and y should have same number of rows")
-        x_anonymizer_train = x
-        if self.train_only_QI:
-            # build DT just on QI features
-            x_anonymizer_train = x[:, self.quasi_identifiers]
        if x.dtype.kind not in 'iufc':
-            x_prepared = self._modify_categorical_features(x_anonymizer_train)
+            if not self.categorical_features:
+                raise ValueError('when supplying an array with non-numeric data, categorical_features must be defined')
+            x_prepared = self._modify_categorical_features(x)
        else:
-            x_prepared = x_anonymizer_train
-        if self.is_regression:
-            self.anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
-        else:
-            self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
-        self.anonymizer.fit(x_prepared, y)
-        cells_by_id = self._calculate_cells(x, x_prepared)
-        return self._anonymize_data_numpy(x, x_prepared, cells_by_id)
-
-    def _anonymize_pandas(self, x, y):
-        if x.shape[0] != y.shape[0]:
-            raise ValueError("x and y should have same number of rows")
-        x_anonymizer_train = x
+            x_prepared = x
+        x_anonymizer_train = x_prepared
        if self.train_only_QI:
            # build DT just on QI features
-            x_anonymizer_train = x.loc[:, self.quasi_identifiers]
-        # need to one-hot encode before training the decision tree
-        x_prepared = self._modify_categorical_features(x_anonymizer_train)
+            x_anonymizer_train = x_prepared[:, self.quasi_identifiers]
        if self.is_regression:
            self.anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
        else:
            self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
-        self.anonymizer.fit(x_prepared, y)
-        cells_by_id = self._calculate_cells(x, x_prepared)
-        return self._anonymize_data_pandas(x, x_prepared, cells_by_id)
+
+        self.anonymizer.fit(x_anonymizer_train, y)
+        cells_by_id = self._calculate_cells(x, x_anonymizer_train)
+        return self._anonymize_data(x, x_anonymizer_train, cells_by_id)

    def _calculate_cells(self, x, x_anonymizer_train):
        # x is original data, x_anonymizer_train is only QIs + 1-hot encoded
@ -130,15 +134,9 @@ class Anonymize:
            # get all rows in cell
            indexes = [index for index, node_id in enumerate(node_ids) if node_id == cell['id']]
            # TODO: should we filter only those with majority label? (using hist)
-            if type(x) == np.ndarray:
-                rows = x[indexes]
-            else:  # pandas
-                rows = x.iloc[indexes]
+            rows = x[indexes]
            for feature in self.quasi_identifiers:
-                if type(x) == np.ndarray:
-                    values = rows[:, feature]
-                else:  # pandas
-                    values = rows.loc[:, feature]
+                values = rows[:, feature]
                if self.categorical_features and feature in self.categorical_features:
                    # find most common value
                    cell['representative'][feature] = Counter(values).most_common(1)[0][0]
@ -163,7 +161,7 @@ class Anonymize:
        node_ids = self._find_sample_nodes(samples)
        return [cells_by_id[node_id] for node_id in node_ids]

-    def _anonymize_data_numpy(self, x, x_anonymizer_train, cells_by_id):
+    def _anonymize_data(self, x, x_anonymizer_train, cells_by_id):
        cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
        index = 0
        for row in x:
@ -173,22 +171,12 @@ class Anonymize:
                row[feature] = cell['representative'][feature]
        return x

-    def _anonymize_data_pandas(self, x, x_anonymizer_train, cells_by_id):
-        cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
-        index = 0
-        for i, row in x.iterrows():
-            cell = cells[index]
-            index += 1
-            for feature in cell['representative']:
-                x.at[i, feature] = cell['representative'][feature]
-        return x
-
    def _modify_categorical_features(self, x):
        # prepare data for DT
        used_features = self.features
        if self.train_only_QI:
            used_features = self.quasi_identifiers
-        numeric_features = [f for f in x.columns if f in used_features and f not in self.categorical_features]
+        numeric_features = [f for f in self.features if f in used_features and f not in self.categorical_features]
        categorical_features = [f for f in self.categorical_features if f in used_features]
        numeric_transformer = Pipeline(
            steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]