Support 1-hot encoded features in anonymization + fixes related to encoding in minimization (#86)

* Support 1-hot encoded features in anonymization (#72)
* Fix anonymization adult notebook + new notebook to demonstrate anonymization on 1-hot encoded data

* Minimizer: No default encoder, if none provided data is supplied to the model as is. Fix data type of representative values. Fix and add more tests.

Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
abigailgold 2023-10-19 11:48:15 +03:00 committed by GitHub
parent 26addd192f
commit 5dce961092
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 670 additions and 255 deletions

View file

@ -23,7 +23,11 @@ class Anonymize:
:type k: int
:param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
in case of numpy data.
:type quasi_identifiers: np.ndarray or list
:type quasi_identifiers: np.ndarray or list of strings or integers.
:param quasi_identifer_slices: If some of the quasi-identifiers represent 1-hot encoded features that need to remain
consistent after anonymization, provide a list containing the list of column names
or indexes that represent a single feature.
:type quasi_identifer_slices: list of lists of strings or integers.
:param categorical_features: The list of categorical features (if supplied, these features will be one-hot encoded
before using them to train the decision tree model).
:type categorical_features: list, optional
@ -35,8 +39,12 @@ class Anonymize:
:type train_only_QI: boolean, optional
"""
def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
is_regression: Optional[bool] = False, train_only_QI: Optional[bool] = False):
def __init__(self, k: int,
quasi_identifiers: Union[np.ndarray, list],
quasi_identifer_slices: Optional[list] = None,
categorical_features: Optional[list] = None,
is_regression: Optional[bool] = False,
train_only_QI: Optional[bool] = False):
if k < 2:
raise ValueError("k should be a positive integer with a value of 2 or higher")
if quasi_identifiers is None or len(quasi_identifiers) < 1:
@ -49,6 +57,7 @@ class Anonymize:
self.train_only_QI = train_only_QI
self.features_names = None
self.features = None
self.quasi_identifer_slices = quasi_identifer_slices
def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
"""
@ -76,7 +85,14 @@ class Anonymize:
if self.categorical_features and not set(self.categorical_features).issubset(set(self.features_names)):
raise ValueError('Categorical features should bs a subset of the supplied features or indexes in range of '
'the data columns')
# transform quasi identifiers to indexes
self.quasi_identifiers = [i for i, v in enumerate(self.features_names) if v in self.quasi_identifiers]
if self.quasi_identifer_slices:
temp_list = []
for slice in self.quasi_identifer_slices:
new_slice = [i for i, v in enumerate(self.features_names) if v in slice]
temp_list.append(new_slice)
self.quasi_identifer_slices = temp_list
if self.categorical_features:
self.categorical_features = [i for i, v in enumerate(self.features_names) if v in self.categorical_features]
@ -126,31 +142,49 @@ class Anonymize:
return cells_by_id
def _find_representatives(self, x, x_anonymizer_train, cells):
# x is original data, x_anonymizer_train is only QIs + 1-hot encoded
# x is original data (always numpy), x_anonymizer_train is only QIs + 1-hot encoded
node_ids = self._find_sample_nodes(x_anonymizer_train)
if self.quasi_identifer_slices:
all_one_hot_features = set([feature for encoded in self.quasi_identifer_slices for feature in encoded])
else:
all_one_hot_features = set()
for cell in cells:
cell['representative'] = {}
# get all rows in cell
indexes = [index for index, node_id in enumerate(node_ids) if node_id == cell['id']]
# TODO: should we filter only those with majority label? (using hist)
rows = x[indexes]
for feature in self.quasi_identifiers:
values = rows[:, feature]
if self.categorical_features and feature in self.categorical_features:
# find most common value
cell['representative'][feature] = Counter(values).most_common(1)[0][0]
else:
# find the mean value (per feature)
median = np.median(values)
min_value = max(values)
min_dist = float("inf")
for value in values:
# euclidean distance between two floating point values
dist = abs(value - median)
if dist < min_dist:
min_dist = dist
min_value = value
cell['representative'][feature] = min_value
done = set()
for feature in self.quasi_identifiers: # self.quasi_identifiers are numerical indexes
if feature not in done:
# deal with 1-hot encoded features
if feature in all_one_hot_features:
# find features that belong together
for encoded in self.quasi_identifer_slices:
if feature in encoded:
values = rows[:, encoded]
unique_rows, counts = np.unique(values, axis=0, return_counts=True)
rep = unique_rows[np.argmax(counts)]
for i, e in enumerate(encoded):
done.add(e)
cell['representative'][e] = rep[i]
else: # rest of features
values = rows[:, feature]
if self.categorical_features and feature in self.categorical_features:
# find most common value
cell['representative'][feature] = Counter(values).most_common(1)[0][0]
else:
# find the mean value (per feature)
median = np.median(values)
min_value = max(values)
min_dist = float("inf")
for value in values:
# euclidean distance between two floating point values
dist = abs(value - median)
if dist < min_dist:
min_dist = dist
min_value = value
cell['representative'][feature] = min_value
def _find_sample_nodes(self, samples):
paths = self._anonymizer.decision_path(samples).toarray()

View file

@ -10,9 +10,6 @@ import copy
import sys
from scipy.spatial import distance
from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.utils.validation import check_is_fitted
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
@ -57,7 +54,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
:param categorical_features: The list of categorical features (if supplied, these features will be one-hot
encoded before using them to train the decision tree model).
:param encoder: Optional encoder for encoding data before feeding it into the estimator (e.g., for categorical
features)
features). If not provided, the data will be fed as is directly to the estimator.
:type encoder: sklearn OrdinalEncoder or OneHotEncoder
:type categorical_features: list of strings, optional
:param features_to_minimize: The features to be minimized.
@ -256,7 +253,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# Going to fit
# (currently not dealing with option to fit with only X and y and no estimator)
if self.estimator and dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
dtype = dataset.get_samples().dtype
x = pd.DataFrame(dataset.get_samples(), columns=self._features)
if not self.features_to_minimize:
self.features_to_minimize = self._features
@ -293,21 +289,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# collect feature data (such as min, max)
self._feature_data = self._get_feature_data(x)
# default encoder in case none provided
if self.encoder is None:
numeric_features = [f for f in self._features if f not in self.categorical_features]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
self.encoder = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, self.categorical_features),
]
)
self.encoder.fit(x)
self.cells = []
self._categorical_values = {}
@ -334,14 +315,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
# self._cells currently holds the generalization created from the tree leaves
self._calculate_generalizations(x_test)
if self.generalize_using_transform:
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
else:
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
generalized = self._generalize(x_test, x_prepared_test, nodes)
# check accuracy
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype), y_test))
accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
print('Initial accuracy of model on generalized data, relative to original model predictions '
'(base generalization derived from tree, before improvements): %f' % accuracy)
@ -364,15 +341,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
self._calculate_generalizations(x_test)
if self.generalize_using_transform:
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
self._cells_by_id)
else:
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype),
y_test))
generalized = self._generalize(x_test, x_prepared_test, nodes)
accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
# if accuracy passed threshold roll back to previous iteration generalizations
if accuracy < self.target_accuracy:
self.cells = cells_previous_iter
@ -395,14 +365,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
if removed_feature is None:
break
self._calculate_generalizations(x_test)
if self.generalize_using_transform:
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
self._cells_by_id)
else:
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype),
y_test))
generalized = self._generalize(x_test, x_prepared_test, nodes)
accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
# self._cells currently holds the chosen generalization based on target accuracy
@ -893,7 +857,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def _generalize_indexes(self, original_data, cells, all_indexes):
# prepared data include one hot encoded categorical data + QI
representatives = pd.DataFrame(columns=self._features) # empty except for columns
dtypes = original_data.dtypes.to_dict()
new_dtypes = {}
for t in dtypes.keys():
new_dtypes[t] = pd.Series(dtype=dtypes[t].name)
representatives = pd.DataFrame(new_dtypes) # empty except for columns
original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)
# iterate over cells (leaves in decision tree)
@ -925,6 +893,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return original_data_generalized
def _generalize(self, data, data_prepared, nodes):
    """Generalize *data* according to the current cells/generalizations.

    Recomputes the generalizations for the given samples, then either
    applies the decision-tree based transformation (when
    ``generalize_using_transform`` is set) or maps samples directly from
    the computed generalizations.

    :param data: Original samples to generalize.
    :param data_prepared: The prepared (QI-only / encoded) counterpart of
        ``data``, used when generalizing from the tree.
    :param nodes: Tree node ids for the prepared samples.
    :return: The generalized samples.
    """
    self._calculate_generalizations(data)
    if self.generalize_using_transform:
        generalized = self._generalize_from_tree(data, data_prepared, nodes, self.cells,
                                                 self._cells_by_id)
    else:
        generalized = self._generalize_from_generalizations(data, self.generalizations)
    return generalized
@staticmethod
def _map_to_ranges_categories(samples, ranges, categories):
all_sample_indexes = []
@ -994,18 +971,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
feature_data[feature],
total)
if feature_ncp > 0:
# divide by accuracy gain
new_cells = copy.deepcopy(self.cells)
cells_by_id = copy.deepcopy(self._cells_by_id)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
cells_by_id)
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
labels)) - current_accuracy
if accuracy_gain < 0:
accuracy_gain = 0
if accuracy_gain != 0:
feature_ncp = feature_ncp / accuracy_gain
feature_ncp = self._normalize_ncp_by_accuracy_gain(original_data, prepared_data, nodes, feature,
feature_ncp, labels, current_accuracy)
if feature_ncp < range_min:
range_min = feature_ncp
@ -1021,19 +988,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
feature_data[feature],
total)
if feature_ncp > 0:
# divide by accuracy loss
new_cells = copy.deepcopy(self.cells)
cells_by_id = copy.deepcopy(self._cells_by_id)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
cells_by_id)
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
labels)) - current_accuracy
feature_ncp = self._normalize_ncp_by_accuracy_gain(original_data, prepared_data, nodes, feature,
feature_ncp, labels, current_accuracy)
if accuracy_gain < 0:
accuracy_gain = 0
if accuracy_gain != 0:
feature_ncp = feature_ncp / accuracy_gain
if feature_ncp < range_min:
range_min = feature_ncp
remove_feature = feature
@ -1063,6 +1020,21 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
feature_ncp += cell_ncp
return feature_ncp
def _normalize_ncp_by_accuracy_gain(self, original_data, prepared_data, nodes, feature, feature_ncp, labels,
                                    current_accuracy):
    """Normalize a feature's NCP score by the accuracy gained from removing it.

    Works on deep copies of the current cells so the candidate removal does
    not mutate ``self.cells`` / ``self._cells_by_id``. The feature is removed
    from the copied cells, the data is re-generalized from the tree, and the
    resulting accuracy is compared to ``current_accuracy``.

    :param feature: The feature whose removal is being evaluated.
    :param feature_ncp: The feature's information-loss (NCP) score.
    :return: ``feature_ncp`` divided by the (positive) accuracy gain, or the
        original ``feature_ncp`` when there is no positive gain.
    """
    new_cells = copy.deepcopy(self.cells)
    cells_by_id = copy.deepcopy(self._cells_by_id)
    GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
    generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
                                             cells_by_id)
    accuracy = self._calculate_accuracy(generalized, labels, self.estimator, self.encoder)
    accuracy_gain = accuracy - current_accuracy
    # A loss in accuracy is treated as zero gain; only a strictly positive
    # gain is used to scale down the NCP score.
    if accuracy_gain < 0:
        accuracy_gain = 0
    if accuracy_gain != 0:
        feature_ncp = feature_ncp / accuracy_gain
    return feature_ncp
def _calculate_generalizations(self, samples: Optional[pd.DataFrame] = None):
ranges, range_representatives = self._calculate_ranges(self.cells)
categories, category_representatives = self._calculate_categories(self.cells)
@ -1282,3 +1254,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
for feature in to_remove:
del generalizations['categories'][feature]
@staticmethod
def _calculate_accuracy(generalized, y_test, estimator, encoder):
    """Score *estimator* on generalized data, encoding it first if needed.

    If an encoder was provided it is applied before scoring; otherwise the
    generalized data is fed to the estimator as is (matching the new
    "no default encoder" behavior of the minimizer).
    """
    generalized_data = encoder.transform(generalized) if encoder else generalized
    return estimator.score(ArrayDataset(generalized_data, y_test))

View file

@ -368,7 +368,7 @@ class PyTorchClassifier(PyTorchModel):
if validation_data is None:
self._art_model.fit(
x=train_data.get_samples(),
y=train_data.get_labels().reshape(-1, 1),
y=train_data.get_labels(),
batch_size=batch_size,
nb_epochs=nb_epochs,
save_checkpoints=save_checkpoints,
@ -379,9 +379,9 @@ class PyTorchClassifier(PyTorchModel):
else:
self._art_model.fit(
x=train_data.get_samples(),
y=train_data.get_labels().reshape(-1, 1),
y=train_data.get_labels(),
x_validation=validation_data.get_samples(),
y_validation=validation_data.get_labels().reshape(-1, 1),
y_validation=validation_data.get_labels(),
batch_size=batch_size,
nb_epochs=nb_epochs,
save_checkpoints=save_checkpoints,

View file

@ -0,0 +1,303 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Using ML anonymization on one-hot encoded data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this tutorial we will show how to anonymize models using the ML anonymization module, specifically when the inout data is already one-hot encoded. \n",
"\n",
"This will be demonstarted using the Adult dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/datasets/adult). "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load data"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['State-gov' 'Never-married' 'Adm-clerical' ... 'White' 'Male'\n",
" 'UnitedStates']\n",
" ['Self-emp-not-inc' 'Married-civ-spouse' 'Exec-managerial' ... 'White'\n",
" 'Male' 'UnitedStates']\n",
" ['Private' 'Divorced' 'Handlers-cleaners' ... 'White' 'Male'\n",
" 'UnitedStates']\n",
" ...\n",
" ['Private' 'Never-married' 'Sales' ... 'White' 'Female' 'UnitedStates']\n",
" ['Private' 'Never-married' 'Craft-repair' ... 'White' 'Male'\n",
" 'UnitedStates']\n",
" ['Private' 'Never-married' 'Handlers-cleaners' ... 'White' 'Male'\n",
" 'UnitedStates']]\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"import os\n",
"import sys\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
"from apt.utils.dataset_utils import get_adult_dataset_pd\n",
"\n",
"# 'workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'\n",
"categorical_features = [1, 3, 4, 5, 6, 7, 11]\n",
"\n",
"# requires a folder called 'datasets' in the current directory\n",
"(x_train, y_train), (x_test, y_test) = get_adult_dataset_pd()\n",
"x_train = x_train.to_numpy()[:, [1, 3, 4, 5, 6, 7, 11]]\n",
"y_train = y_train.to_numpy().astype(int)\n",
"x_test = x_test.to_numpy()[:, [1, 3, 4, 5, 6, 7, 11]]\n",
"y_test = y_test.to_numpy().astype(int)\n",
"\n",
"# get balanced dataset\n",
"x_train = x_train[:x_test.shape[0]]\n",
"y_train = y_train[:y_test.shape[0]]\n",
"\n",
"print(x_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Encode data"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0 0 0 ... 0 1 0]\n",
" [0 0 0 ... 0 1 0]\n",
" [0 0 0 ... 0 1 0]\n",
" ...\n",
" [0 0 0 ... 0 1 0]\n",
" [0 0 0 ... 0 1 0]\n",
" [0 0 0 ... 0 1 0]]\n"
]
}
],
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"import scipy\n",
"\n",
"preprocessor = OneHotEncoder(handle_unknown=\"ignore\")\n",
"\n",
"x_train = preprocessor.fit_transform(x_train)\n",
"x_test = preprocessor.transform(x_test)\n",
"if scipy.sparse.issparse(x_train):\n",
" x_train = x_train.toarray().astype(int)\n",
"if scipy.sparse.issparse(x_test):\n",
" x_test = x_test.toarray().astype(int)\n",
"\n",
"print(x_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train decision tree model"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Base model accuracy: 0.814446287083103\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/abigailt/Library/Python/3.9/lib/python/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
" warnings.warn(msg, category=FutureWarning)\n"
]
}
],
"source": [
"from sklearn.tree import DecisionTreeClassifier\n",
"from art.estimators.classification.scikitlearn import ScikitlearnDecisionTreeClassifier\n",
"\n",
"model = DecisionTreeClassifier()\n",
"model.fit(x_train, y_train)\n",
"\n",
"art_classifier = ScikitlearnDecisionTreeClassifier(model)\n",
"\n",
"print('Base model accuracy: ', model.score(x_test, y_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Anonymize data\n",
"## k=100\n",
"\n",
"The data is anonymized on the quasi-identifiers: age, education-num, capital-gain, hours-per-week and with a privact parameter k=100.\n",
"\n",
"This means that each record in the anonymized dataset is identical to 99 others on the quasi-identifier values (i.e., when looking only at those features, the records are indistinguishable)."
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0 0 0 ... 0 1 0]\n",
" [0 0 0 ... 0 1 0]\n",
" [0 0 0 ... 0 1 0]\n",
" ...\n",
" [0 0 0 ... 0 1 0]\n",
" [0 0 0 ... 0 1 0]\n",
" [0 0 0 ... 0 1 0]]\n"
]
}
],
"source": [
"from apt.utils.datasets import ArrayDataset\n",
"from apt.anonymization import Anonymize\n",
"\n",
"x_train_predictions = np.array([np.argmax(arr) for arr in art_classifier.predict(x_train)])\n",
"\n",
"# QI = (race, sex)\n",
"QI = [53, 52, 51, 50, 49, 48, 47]\n",
"QI_slices = [[47, 48, 49, 50, 51], [52, 53]]\n",
"anonymizer = Anonymize(100, QI)\n",
"anon = anonymizer.anonymize(ArrayDataset(x_train, x_train_predictions))\n",
"print(anon)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2711"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# number of distinct rows in original data\n",
"len(np.unique(x_train, axis=0))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2476"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# number of distinct rows in anonymized data\n",
"len(np.unique(anon, axis=0))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train decision tree model"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Anonymized model accuracy: 0.8135863890424421\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/abigailt/Library/Python/3.9/lib/python/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
" warnings.warn(msg, category=FutureWarning)\n"
]
}
],
"source": [
"anon_model = DecisionTreeClassifier()\n",
"anon_model.fit(anon, y_train)\n",
"\n",
"anon_art_classifier = ScikitlearnDecisionTreeClassifier(anon_model)\n",
"\n",
"print('Anonymized model accuracy: ', anon_model.score(x_test, y_test))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -1,7 +1,6 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -9,7 +8,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -23,13 +21,72 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/abigailt/Library/Python/3.9/lib/python/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 39 13 2174 0 40]\n",
" [ 50 13 0 0 13]\n",
" [ 38 9 0 0 40]\n",
" ...\n",
" [ 27 13 0 0 40]\n",
" [ 26 11 0 0 48]\n",
" [ 27 9 0 0 40]]\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"import os\n",
"import sys\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
"from apt.utils.dataset_utils import get_adult_dataset_pd\n",
"\n",
"# requires a folder called 'datasets' in the current directory\n",
"(x_train, y_train), (x_test, y_test) = get_adult_dataset_pd()\n",
"x_train = x_train.to_numpy()\n",
"y_train = y_train.to_numpy().astype(int)\n",
"x_test = x_test.to_numpy()\n",
"y_test = y_test.to_numpy().astype(int)\n",
"\n",
"# Use only numeric features (age, education-num, capital-gain, capital-loss, hours-per-week)\n",
"x_train = x_train[:, [0, 2, 8, 9, 10]].astype(int)\n",
"x_test = x_test[:, [0, 2, 8, 9, 10]].astype(int)\n",
"\n",
"# get balanced dataset\n",
"x_train = x_train[:x_test.shape[0]]\n",
"y_train = y_train[:y_test.shape[0]]\n",
"\n",
"print(x_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train decision tree model"
]
},
{
"cell_type": "code",
"execution_count": 3,
@ -39,76 +96,14 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 39. 13. 2174. 0. 40.]\n",
" [ 50. 13. 0. 0. 13.]\n",
" [ 38. 9. 0. 0. 40.]\n",
" ...\n",
" [ 27. 13. 0. 0. 40.]\n",
" [ 26. 11. 0. 0. 48.]\n",
" [ 27. 9. 0. 0. 40.]]\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"# Use only numeric features (age, education-num, capital-gain, capital-loss, hours-per-week)\n",
"x_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
" usecols=(0, 4, 10, 11, 12), delimiter=\", \")\n",
"\n",
"y_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
" usecols=14, dtype=str, delimiter=\", \")\n",
"\n",
"\n",
"x_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
" usecols=(0, 4, 10, 11, 12), delimiter=\", \", skiprows=1)\n",
"\n",
"y_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
" usecols=14, dtype=str, delimiter=\", \", skiprows=1)\n",
"\n",
"# Trim trailing period \".\" from label\n",
"y_test = np.array([a[:-1] for a in y_test])\n",
"\n",
"y_train[y_train == '<=50K'] = 0\n",
"y_train[y_train == '>50K'] = 1\n",
"y_train = y_train.astype(int)\n",
"\n",
"y_test[y_test == '<=50K'] = 0\n",
"y_test[y_test == '>50K'] = 1\n",
"y_test = y_test.astype(int)\n",
"\n",
"# get balanced dataset\n",
"x_train = x_train[:x_test.shape[0]]\n",
"y_train = y_train[:y_test.shape[0]]\n",
"\n",
"print(x_train)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train decision tree model"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Base model accuracy: 0.8076285240464345\n"
"Base model accuracy: 0.8087341072415699\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mayaa/Development/GitHub/aiprivacy/ai-privacy-toolkit/venv1/lib/python3.8/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
"/Users/abigailt/Library/Python/3.9/lib/python/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
" warnings.warn(msg, category=FutureWarning)\n"
]
}
@ -122,13 +117,10 @@
"\n",
"art_classifier = ScikitlearnDecisionTreeClassifier(model)\n",
"\n",
"print('Base model accuracy: ', model.score(x_test, y_test))\n",
"\n",
"x_train_predictions = np.array([np.argmax(arr) for arr in art_classifier.predict(x_train)]).reshape(-1,1)"
"print('Base model accuracy: ', model.score(x_test, y_test))"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -139,7 +131,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@ -159,7 +151,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -168,14 +159,14 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5460017196904557\n"
"0.5434836015231544\n"
]
}
],
@ -191,7 +182,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -199,7 +189,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -213,30 +202,29 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[38. 13. 0. 0. 40.]\n",
" [46. 13. 0. 0. 35.]\n",
" [28. 9. 0. 0. 40.]\n",
"[[38 13 0 0 40]\n",
" [46 13 0 0 35]\n",
" [28 9 0 0 40]\n",
" ...\n",
" [26. 13. 0. 0. 40.]\n",
" [27. 10. 0. 0. 50.]\n",
" [28. 9. 0. 0. 40.]]\n"
" [26 13 0 0 40]\n",
" [27 10 0 0 50]\n",
" [28 9 0 0 40]]\n"
]
}
],
"source": [
"import os\n",
"import sys\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
"from apt.utils.datasets import ArrayDataset\n",
"from apt.anonymization import Anonymize\n",
"\n",
"x_train_predictions = np.array([np.argmax(arr) for arr in art_classifier.predict(x_train)])\n",
"\n",
"# QI = (age, education-num, capital-gain, hours-per-week)\n",
"QI = [0, 1, 2, 4]\n",
"anonymizer = Anonymize(100, QI)\n",
@ -246,7 +234,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@ -255,7 +243,7 @@
"6739"
]
},
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -267,7 +255,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@ -276,7 +264,7 @@
"401"
]
},
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@ -287,7 +275,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -296,21 +283,21 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Anonymized model accuracy: 0.826914808672686\n"
"Anonymized model accuracy: 0.8308457711442786\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mayaa/Development/GitHub/aiprivacy/ai-privacy-toolkit/venv1/lib/python3.8/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
"/Users/abigailt/Library/Python/3.9/lib/python/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
" warnings.warn(msg, category=FutureWarning)\n"
]
}
@ -325,7 +312,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -335,14 +321,14 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.49692912418621793\n"
"0.4944724235351923\n"
]
}
],
@ -364,7 +350,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -380,8 +365,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"(0.5316007088009451, 0.7738607050730868)\n",
"(0.4971184877823882, 0.5297874953936863)\n"
"without anonymization: (0.5303914835164835, 0.7588748311018303)\n",
"with anonymization: (0.49255952380952384, 0.3659255619702739)\n"
]
}
],
@ -411,15 +396,14 @@
" return precision, recall\n",
"\n",
"# regular\n",
"print(calc_precision_recall(np.concatenate((inferred_train_bb, inferred_test_bb)), \n",
"print('without anonymization:', calc_precision_recall(np.concatenate((inferred_train_bb, inferred_test_bb)), \n",
" np.concatenate((np.ones(len(inferred_train_bb)), np.zeros(len(inferred_test_bb))))))\n",
"# anon\n",
"print(calc_precision_recall(np.concatenate((anon_inferred_train_bb, anon_inferred_test_bb)), \n",
"print('with anonymization:', calc_precision_recall(np.concatenate((anon_inferred_train_bb, anon_inferred_test_bb)), \n",
" np.concatenate((np.ones(len(anon_inferred_train_bb)), np.zeros(len(anon_inferred_test_bb))))))"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -429,7 +413,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@ -443,7 +427,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.9.6"
}
},
"nbformat": 4,

View file

@ -1,5 +1,6 @@
import pytest
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
@ -118,6 +119,74 @@ def test_regression():
assert ((np.delete(anon, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
def test_anonymize_ndarray_one_hot():
    """k-anonymize a numpy dataset in which columns 1-2 are a single 1-hot encoded
    feature (gender), declared via ``quasi_identifer_slices`` so the anonymizer keeps
    the slice consistent (exactly one hot column per row) after generalization.
    """
    x_train = np.array([[23, 0, 1, 165],
                        [45, 0, 1, 158],
                        [56, 1, 0, 123],
                        [67, 0, 1, 154],
                        [45, 1, 0, 149],
                        [42, 1, 0, 166],
                        [73, 0, 1, 172],
                        [94, 0, 1, 168],
                        [69, 0, 1, 175],
                        [24, 1, 0, 181],
                        [18, 1, 0, 190]])
    y_train = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    model = DecisionTreeClassifier()
    model.fit(x_train, y_train)
    pred = model.predict(x_train)

    k = 10
    QI = [0, 1, 2]
    QI_slices = [[1, 2]]  # columns 1 and 2 together form one 1-hot feature
    anonymizer = Anonymize(k, QI, train_only_QI=True, quasi_identifer_slices=QI_slices)
    anon = anonymizer.anonymize(ArrayDataset(x_train, pred))

    # anonymization must reduce the number of distinct QI combinations
    assert (len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0)))
    # k-anonymity: every distinct QI *row* must occur at least k times.
    # axis=0 is required here; without it np.unique flattens the array and counts
    # scalar values, which does not verify the per-record k-anonymity guarantee
    # (the pandas variant of this test correctly counts rows via value_counts()).
    _, counts_elements = np.unique(anon[:, QI], axis=0, return_counts=True)
    assert (np.min(counts_elements) >= k)
    # non-QI columns must be left untouched
    assert ((np.delete(anon, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
    # the 1-hot slice must remain a valid encoding: exactly one 1 and the rest 0
    anonymized_slice = anon[:, QI_slices[0]]
    assert ((np.sum(anonymized_slice, axis=1) == 1).all())
    assert ((np.max(anonymized_slice, axis=1) == 1).all())
    assert ((np.min(anonymized_slice, axis=1) == 0).all())
def test_anonymize_pandas_one_hot():
    """k-anonymize a pandas dataset where gender is 1-hot encoded across two
    columns that are declared as a single quasi-identifier slice.
    """
    feature_names = ["age", "gender_M", "gender_F", "height"]
    raw = np.array([[23, 0, 1, 165],
                    [45, 0, 1, 158],
                    [56, 1, 0, 123],
                    [67, 0, 1, 154],
                    [45, 1, 0, 149],
                    [42, 1, 0, 166],
                    [73, 0, 1, 172],
                    [94, 0, 1, 168],
                    [69, 0, 1, 175],
                    [24, 1, 0, 181],
                    [18, 1, 0, 190]])
    labels = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    x_train = pd.DataFrame(raw, columns=feature_names)
    y_train = pd.Series(labels)

    model = DecisionTreeClassifier()
    model.fit(x_train, y_train)
    pred = model.predict(x_train)

    k = 10
    QI = ["age", "gender_M", "gender_F"]
    QI_slices = [["gender_M", "gender_F"]]  # the two columns encode one feature
    anonymizer = Anonymize(k, QI, train_only_QI=True, quasi_identifer_slices=QI_slices)
    anon = anonymizer.anonymize(ArrayDataset(x_train, pred))

    # fewer distinct QI combinations after anonymization
    before = x_train.loc[:, QI].drop_duplicates().shape[0]
    after = anon.loc[:, QI].drop_duplicates().shape[0]
    assert (after < before)
    # every remaining QI combination appears at least k times
    assert (anon.loc[:, QI].value_counts().min() >= k)
    # columns outside the QI set are untouched
    np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
    # the 1-hot slice stays a valid encoding: one column is 1, the other 0
    anonymized_slice = anon.loc[:, QI_slices[0]]
    assert ((np.sum(anonymized_slice, axis=1) == 1).all())
    assert ((np.max(anonymized_slice, axis=1) == 1).all())
    assert ((np.min(anonymized_slice, axis=1) == 0).all())
def test_errors():
    """Constructor validation: k=1 provides no anonymity and must be rejected."""
    invalid_k = 1
    with pytest.raises(ValueError):
        Anonymize(invalid_k, [0, 2])

View file

@ -11,6 +11,8 @@ from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from torch import nn, optim
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
@ -24,6 +26,9 @@ from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegresso
tf.compat.v1.disable_eager_execution()
ACCURACY_DIFF = 0.05
@pytest.fixture
def diabetes_dataset():
return load_diabetes()
@ -286,7 +291,7 @@ def test_minimizer_fit(data_two_features):
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_minimizer_ncp(data_two_features):
@ -348,12 +353,15 @@ def test_minimizer_ncp_categorical(data_four_features):
train_dataset = ArrayDataset(x, predictions, features_names=features)
gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, generalize_using_transform=False)
categorical_features=categorical_features,
generalize_using_transform=False,
encoder=preprocessor)
gen1.fit(dataset=train_dataset)
ncp1 = gen1.ncp.fit_score
ncp2 = gen1.calculate_ncp(ad1)
gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, categorical_features=categorical_features)
gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, categorical_features=categorical_features,
encoder=preprocessor)
gen2.fit(dataset=train_dataset)
ncp3 = gen2.ncp.fit_score
gen2.transform(dataset=ad1)
@ -414,7 +422,8 @@ def test_minimizer_fit_pandas(data_four_features):
# Now we have a full prediction pipeline.
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features)
categorical_features=categorical_features,
encoder=preprocessor)
train_dataset = ArrayDataset(x, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(x))
@ -428,7 +437,7 @@ def test_minimizer_fit_pandas(data_four_features):
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_minimizer_params_categorical(cells_categorical):
@ -450,13 +459,14 @@ def test_minimizer_params_categorical(cells_categorical):
# Now we have a full prediction pipeline.
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, cells=cells)
categorical_features=categorical_features, cells=cells,
encoder=preprocessor)
train_dataset = ArrayDataset(x, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(x))
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_minimizer_fit_qi(data_three_features):
@ -484,7 +494,7 @@ def test_minimizer_fit_qi(data_three_features):
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_minimizer_fit_pandas_qi(data_five_features):
@ -508,7 +518,8 @@ def test_minimizer_fit_pandas_qi(data_five_features):
# Now we have a full prediction pipeline.
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=qi)
categorical_features=categorical_features, features_to_minimize=qi,
encoder=preprocessor)
train_dataset = ArrayDataset(x, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(x))
@ -523,7 +534,7 @@ def test_minimizer_fit_pandas_qi(data_five_features):
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_minimize_ndarray_iris():
@ -552,7 +563,7 @@ def test_minimize_ndarray_iris():
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_minimize_pandas_adult():
@ -582,7 +593,8 @@ def test_minimize_pandas_adult():
predictions = np.argmax(predictions, axis=1)
target_accuracy = 0.7
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=qi)
categorical_features=categorical_features, features_to_minimize=qi,
encoder=preprocessor)
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
transformed = gen.transform(dataset=ArrayDataset(x_train))
gener = gen.generalizations
@ -609,7 +621,7 @@ def test_minimize_pandas_adult():
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_german_credit_pandas():
@ -637,7 +649,8 @@ def test_german_credit_pandas():
predictions = np.argmax(predictions, axis=1)
target_accuracy = 0.7
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=qi)
categorical_features=categorical_features, features_to_minimize=qi,
encoder=preprocessor)
gen.fit(dataset=ArrayDataset(x_train, predictions))
transformed = gen.transform(dataset=ArrayDataset(x_train))
gener = gen.generalizations
@ -666,7 +679,7 @@ def test_german_credit_pandas():
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_regression(diabetes_dataset):
@ -726,7 +739,7 @@ def test_regression(diabetes_dataset):
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_x_y():
@ -766,7 +779,7 @@ def test_x_y():
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_x_y_features_names():
@ -806,7 +819,7 @@ def test_x_y_features_names():
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_BaseEstimator_classification(data_five_features):
@ -828,7 +841,8 @@ def test_BaseEstimator_classification(data_five_features):
# Now we have a full prediction pipeline.
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=QI)
categorical_features=categorical_features, features_to_minimize=QI,
encoder=preprocessor)
train_dataset = ArrayDataset(x, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(x))
@ -844,7 +858,7 @@ def test_BaseEstimator_classification(data_five_features):
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(preprocessor.transform(transformed), predictions)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_BaseEstimator_regression(diabetes_dataset):
@ -903,7 +917,7 @@ def test_BaseEstimator_regression(diabetes_dataset):
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(transformed, predictions)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_keras_model():
@ -936,7 +950,39 @@ def test_keras_model():
check_ncp(ncp, gener)
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
class PytorchModel(nn.Module):
    """Small fully-connected classifier shared by the pytorch minimizer tests.

    Four Tanh-activated hidden layers narrow the representation
    (num_features -> 1024 -> 512 -> 256 -> 128) before a final linear
    classification head producing ``num_classes`` logits.
    """

    def __init__(self, num_classes, num_features):
        super().__init__()
        # Attribute names (fc1..fc4, classifier) are part of the state_dict
        # layout and are kept as-is.
        self.fc1 = nn.Sequential(nn.Linear(num_features, 1024), nn.Tanh())
        self.fc2 = nn.Sequential(nn.Linear(1024, 512), nn.Tanh())
        self.fc3 = nn.Sequential(nn.Linear(512, 256), nn.Tanh())
        self.fc4 = nn.Sequential(nn.Linear(256, 128), nn.Tanh())
        self.classifier = nn.Linear(128, num_classes)

    def forward(self, x):
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = layer(hidden)
        return self.classifier(hidden)
def test_minimizer_pytorch(data_three_features):
@ -944,49 +990,17 @@ def test_minimizer_pytorch(data_three_features):
x = x.astype(np.float32)
qi = ['age', 'weight']
from torch import nn, optim
from apt.utils.datasets.datasets import PytorchData
from apt.utils.models.pytorch_model import PyTorchClassifier
class pytorch_model(nn.Module):
def __init__(self, num_classes, num_features):
super(pytorch_model, self).__init__()
self.fc1 = nn.Sequential(
nn.Linear(num_features, 1024),
nn.Tanh(), )
self.fc2 = nn.Sequential(
nn.Linear(1024, 512),
nn.Tanh(), )
self.fc3 = nn.Sequential(
nn.Linear(512, 256),
nn.Tanh(), )
self.fc4 = nn.Sequential(
nn.Linear(256, 128),
nn.Tanh(),
)
self.classifier = nn.Linear(128, num_classes)
def forward(self, x):
out = self.fc1(x)
out = self.fc2(out)
out = self.fc3(out)
out = self.fc4(out)
return self.classifier(out)
base_est = pytorch_model(2, 3)
base_est = PytorchModel(2, 3)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(base_est.parameters(), lr=0.01)
model = PyTorchClassifier(model=base_est, output_type=ModelOutputType.CLASSIFIER_LOGITS, loss=criterion,
optimizer=optimizer, input_shape=(3,),
nb_classes=2)
model.fit(PytorchData(x.astype(np.float32), y), save_entire_model=False, nb_epochs=10)
model.fit(PytorchData(x, y), save_entire_model=False, nb_epochs=10)
ad = ArrayDataset(x)
predictions = model.predict(ad)
@ -1006,7 +1020,41 @@ def test_minimizer_pytorch(data_three_features):
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(transformed.astype(np.float32), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_minimizer_pytorch_iris():
    """End-to-end minimization of the iris dataset with a PyTorch classifier."""
    features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
    (x_train, y_train), _ = get_iris_dataset_np()
    x_train = x_train.astype(np.float32)
    # minimize every feature
    qi = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

    from apt.utils.datasets.datasets import PytorchData
    from apt.utils.models.pytorch_model import PyTorchClassifier

    # 3 iris classes, 4 input features
    net = PytorchModel(3, 4)
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(net.parameters(), lr=0.01)
    model = PyTorchClassifier(model=net, output_type=ModelOutputType.CLASSIFIER_LOGITS,
                              loss=loss_fn, optimizer=opt, input_shape=(4,), nb_classes=3)
    model.fit(PytorchData(x_train, y_train), save_entire_model=False, nb_epochs=10)

    predictions = model.predict(ArrayDataset(x_train))
    if predictions.shape[1] > 1:
        # collapse per-class logits to class labels
        predictions = np.argmax(predictions, axis=1)

    target_accuracy = 0.99
    gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=qi)
    transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features))

    gener = gen.generalizations
    check_features(features, gener, transformed, x_train)
    check_ncp(gen.ncp.transform_score, gener)
    rel_accuracy = model.score(ArrayDataset(transformed.astype(np.float32), predictions))
    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_untouched():