Squashed commit of the following:

commit d53818644e
Author: olasaadi <92303887+olasaadi@users.noreply.github.com>
Date:   Mon Mar 7 20:12:55 2022 +0200

    Build the dt on all features anon (#23)

    * add param to build the DT on all features and not just on QI
    * one-hot encoding only for categorical features

commit c47819a031
Author: abigailt <abigailt@il.ibm.com>
Date:   Wed Feb 23 19:40:11 2022 +0200

    Update docs

commit 7e2ce7fe96
Merge: 7fbd1e4 752871d
Author: abigailt <abigailt@il.ibm.com>
Date:   Wed Feb 23 19:26:44 2022 +0200

    Merge remote-tracking branch 'origin/main' into main

commit 7fbd1e4b90
Author: abigailt <abigailt@il.ibm.com>
Date:   Wed Feb 23 19:22:54 2022 +0200

    Update version and docs

commit 752871dd0c
Author: olasaadi <92303887+olasaadi@users.noreply.github.com>
Date:   Wed Feb 23 14:57:12 2022 +0200

    add minimization notebook (#22)

    * add german credit notebook to showcase new features (minimize only some features and categorical features)

    * add notebook to show minimization data on a regression problem
This commit is contained in:
abigailt 2022-04-25 17:39:30 +03:00
parent fb2413c4aa
commit a37ff06df8
12 changed files with 753 additions and 69 deletions

View file

@ -6,4 +6,4 @@ from apt import anonymization
from apt import minimization
from apt import utils
__version__ = "0.0.3"
__version__ = "0.0.4"

View file

@ -3,6 +3,9 @@ import pandas as pd
from scipy.spatial import distance
from collections import Counter
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
@ -15,28 +18,38 @@ class Anonymize:
Class for performing tailored, model-guided anonymization of training datasets for ML models.
Based on the implementation described in: https://arxiv.org/abs/2007.13086
Parameters
----------
k : int
The privacy parameter that determines the number of records that will be indistinguishable from each
other (when looking at the quasi identifiers). Should be at least 2.
quasi_identifiers : np.ndarray or list
The features that need to be minimized in case of pandas data, and indexes of features
in case of numpy data.
categorical_features : list, optional
The list of categorical features (should only be supplied when passing data as a
pandas dataframe.
is_regression : Bool, optional
Whether the model is a regression model or not (if False, assumes
a classification model). Default is False.
train_only_QI : Bool, optional
The required method to train data set for anonymization. Default is
to train the tree on all features.
"""
def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
is_regression=False):
"""
:param k: The privacy parameter that determines the number of records that will be indistinguishable from each
other (when looking at the quasi identifiers). Should be at least 2.
:param quasi_identifiers: The features that need to be minimized. It can be a list of feature names (strings) if
dataset.feature_names is set, otherwise a list of indexes (integers).
:param categorical_features: The list of categorical features. It can be a list of feature names (strings) if
dataset.feature_names is set, otherwise a list of indexes (integers).
:param is_regression: Boolean param indicates that it is a regression problem.
"""
is_regression=False, train_only_QI=False):
if k < 2:
raise ValueError("k should be a positive integer with a value of 2 or higher")
if quasi_identifiers is None or len(quasi_identifiers) < 1:
raise ValueError("The list of quasi-identifiers cannot be empty")
self.k = k
self.quasi_identifiers = quasi_identifiers
self.categorical_features = categorical_features
self.is_regression = is_regression
self.features_names = None
self.train_only_QI = train_only_QI
def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
"""
@ -72,7 +85,10 @@ class Anonymize:
def _anonymize(self, x, y):
if x.shape[0] != y.shape[0]:
raise ValueError("x and y should have same number of rows")
x_anonymizer_train = x[:, self.quasi_identifiers]
x_anonymizer_train = x
if self.train_only_QI:
# build DT just on QI features
x_anonymizer_train = x[:, self.quasi_identifiers]
if x.dtype.kind not in 'iufc':
if not self.categorical_features:
raise ValueError('when supplying an array with non-numeric data, categorical_features must be defined')
@ -151,6 +167,21 @@ class Anonymize:
return x
def _modify_categorical_features(self, x):
encoder = OneHotEncoder()
one_hot_encoded = encoder.fit_transform(x)
return one_hot_encoded
# prepare data for DT
used_features = self.features
if self.train_only_QI:
used_features = self.quasi_identifiers
numeric_features = [f for f in x.columns if f in used_features and f not in self.categorical_features]
categorical_features = [f for f in self.categorical_features if f in used_features]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
encoded = preprocessor.fit_transform(x)
return encoded

View file

@ -12,8 +12,5 @@ them to new data.
It is also possible to export the generalizations as feature ranges.
The current implementation supports only numeric features, so any categorical features must be transformed to a numeric
representation before using this class.
"""
from apt.minimization.minimizer import GeneralizeToRepresentative

View file

@ -22,6 +22,7 @@ from apt.utils.models import Model, SklearnRegressor, ModelOutputType, SklearnCl
class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
""" A transformer that generalizes data to representative points.
Learns data generalizations based on an original model's predictions
and a target accuracy. Once the generalizations are learned, can
receive one or more data records and transform them to representative
@ -58,6 +59,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
The required method to train data set for minimizing. Default is
to train the tree just on the features that are given as
features_to_minimize.
is_regression : Bool, optional
Whether the model is a regression model or not (if False, assumes
a classification model). Default is False.
Attributes
----------
features_ : list of str
@ -69,8 +74,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
as measured on the training data.
generalizations_ : object
The generalizations that were learned (actual feature ranges).
Notes
-----
"""
def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: float = 0.998,
@ -95,11 +98,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def get_params(self, deep=True):
"""Get parameters for this estimator.
Parameters
----------
deep : boolean, optional
If True, will return the parameters for this estimator and contained
subobjects that are estimators.
Returns
-------
params : mapping of string to any
@ -116,6 +121,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def set_params(self, **params):
"""Set the parameters of this estimator.
Returns
-------
self : object
@ -134,6 +140,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
features_names: Optional = None, dataset: Optional[ArrayDataset] = None):
"""Learns the generalizations based on training data, and applies them to the data.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
@ -158,6 +165,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def fit(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
features_names: Optional = None, dataset: ArrayDataset = None):
"""Learns the generalizations based on training data.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
@ -380,6 +388,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional = None, dataset: ArrayDataset = None):
""" Transforms data records to representative points.
Parameters
----------
X : {array-like, sparse-matrix}, shape (n_samples, n_features), If provided as a pandas dataframe,

View file

@ -18,14 +18,14 @@ def _load_iris(test_set_size: float = 0.3):
return (x_train, y_train), (x_test, y_test)
def get_iris_dataset():
def get_iris_dataset(test_set: float = 0.3):
"""
Loads the Iris dataset from scikit-learn.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1).
:return: Entire dataset and labels as numpy array.
"""
return _load_iris()
return _load_iris(test_set)
def _load_diabetes(test_set_size: float = 0.3):
@ -54,6 +54,7 @@ def get_german_credit_dataset(test_set: float = 0.3):
"""
Loads the UCI German_credit dataset from `tests/datasets/german` or downloads it if necessary.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1).
:return: Dataset and labels as pandas dataframes.
"""

View file

@ -22,7 +22,7 @@ copyright = '2021, IBM'
author = 'Abigail Goldsteen'
# The full version, including alpha/beta/rc tags
release = '0.0.3'
release = '0.0.4'
master_doc = 'index'

View file

@ -1,30 +0,0 @@
tests package
=============
Submodules
----------
tests.test\_anonymizer module
-----------------------------
.. automodule:: tests.test_anonymizer
:members:
:undoc-members:
:show-inheritance:
tests.test\_minimizer module
----------------------------
.. automodule:: tests.test_minimizer
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: tests
:members:
:undoc-members:
:show-inheritance:

View file

@ -0,0 +1,262 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"# Applying data minimization to a trained regression ML model"
]
},
{
"cell_type": "markdown",
"source": [
"In this tutorial we will show how to perform data minimization for regression ML models using the minimization module.\n",
"\n",
"We will show how to apply data minimization to different trained regression models."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Load data\n",
"QI parameter determines which features will be minimized."
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 54,
"outputs": [],
"source": [
"from sklearn.datasets import load_diabetes\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"dataset = load_diabetes()\n",
"X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)\n",
"\n",
"features = ['age', 'sex', 'bmi', 'bp',\n",
" 's1', 's2', 's3', 's4', 's5', 's6']\n",
"QI = [0, 2, 5, 8, 9]"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## Train DecisionTreeRegressor model"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 55,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Base model accuracy (R2 score): 0.15014421352446072\n"
]
}
],
"source": [
"from apt.minimization import GeneralizeToRepresentative\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"\n",
"model1 = DecisionTreeRegressor(random_state=10, min_samples_split=2)\n",
"model1.fit(X_train, y_train)\n",
"print('Base model accuracy (R2 score): ', model1.score(X_test, y_test))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## Run minimization\n",
"We will try to run minimization with only a subset of the features."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 56,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.108922\n",
"Improving accuracy\n",
"feature to remove: s5\n",
"Removed feature: s5, new relative accuracy: 0.505498\n",
"feature to remove: s6\n",
"Removed feature: s6, new relative accuracy: 0.404757\n",
"feature to remove: bmi\n",
"Removed feature: bmi, new relative accuracy: 0.718978\n",
"Accuracy on minimized data: 0.11604533946025941\n",
"generalizations: {'ranges': {'age': [-0.07090024650096893, -0.043656209483742714, -0.041839939542114735, -0.03639113181270659, -0.01459590089507401, -0.012779632292222232, -0.009147093165665865, -0.0036982858437113464, 0.03989217430353165, 0.039892176166176796, 0.05623859912157059, 0.06713621318340302], 's2': [-0.0550188384950161, -0.0285577941685915, -0.024643437936902046, -0.02135537937283516, -0.013683241792023182, -0.006480826530605555, 0.009176596067845821, 0.023111702874302864, 0.02420772146433592, 0.02655633445829153, 0.039082273840904236]}, 'categories': {}, 'untouched': ['s3', 'bmi', 's6', 'bp', 's4', 's5', 'sex', 's1']}\n"
]
}
],
"source": [
"# note that is_regression param is True\n",
"\n",
"minimizer1 = GeneralizeToRepresentative(model1, target_accuracy=0.7, features=features, is_regression=True,\n",
" features_to_minimize=QI)\n",
"\n",
"# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
"# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n",
"# data it could result in a larger gap)\n",
"# Don't forget to leave a hold-out set for final validation!\n",
"X_generalizer_train1, x_test1, y_generalizer_train1, y_test1 = train_test_split(X_test, y_test,\n",
" test_size = 0.4, random_state = 38)\n",
"\n",
"x_train_predictions1 = model1.predict(X_generalizer_train1)\n",
"minimizer1.fit(X_generalizer_train1, x_train_predictions1)\n",
"transformed1 = minimizer1.transform(x_test1)\n",
"print('Accuracy on minimized data: ', model1.score(transformed1, y_test1))\n",
"print('generalizations: ',minimizer1.generalizations_)#%% md"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## Train linear regression model"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"source": [
"from sklearn.linear_model import LinearRegression\n",
"from apt.minimization import GeneralizeToRepresentative\n",
"\n",
"model2 = LinearRegression()\n",
"model2.fit(X_train, y_train)\n",
"print('Base model accuracy (R2 score): ', model2.score(X_test, y_test))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Run minimization\n",
"We will try to run minimization with only a subset of the features."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 58,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.225782\n",
"Improving accuracy\n",
"feature to remove: age\n",
"Removed feature: age, new relative accuracy: 0.223565\n",
"feature to remove: s2\n",
"Removed feature: s2, new relative accuracy: 0.759788\n",
"Accuracy on minimized data: 0.4414329261774286\n",
"generalizations: {'ranges': {'bmi': [-0.0660245232284069, -0.06171327643096447, -0.048779530450701714, -0.036923596635460854, -0.022912041284143925, -0.015906263142824173, -0.009978296235203743, 0.007266696775332093, 0.022356065921485424, 0.028822937980294228, 0.04499012045562267, 0.053073709830641747, 0.10103634744882584], 's5': [-0.08940735459327698, -0.07823517918586731, -0.07310866191983223, -0.07022909820079803, -0.06740894541144371, -0.06558558344841003, -0.041897499933838844, -0.04049498960375786, -0.03781316243112087, -0.033939776942133904, -0.03263746201992035, -0.02538660168647766, -0.023219254799187183, -0.017585186287760735, -0.016525186598300934, -0.008522996446117759, 0.0015758189256303012, 0.012934560421854258, 0.014069339726120234, 0.015929921995848417, 0.01947084255516529, 0.028651678003370762, 0.03358383011072874, 0.03639278281480074, 0.041416410356760025, 0.06386702693998814], 's6': [-0.07356456853449345, -0.052854035049676895, -0.048711927607655525, -0.0383566590026021, -0.02800139266764745, -0.021788232028484344, -0.007290858076885343, -0.007290857844054699, 0.017561784014105797, 0.02377494378015399, 0.02791705122217536, 0.02998810407007113, 0.054840744473040104]}, 'categories': {}, 'untouched': ['s2', 's3', 'bp', 's4', 'age', 'sex', 's1']}\n"
]
}
],
"source": [
"# note that is_regression param is True\n",
"\n",
"minimizer2 = GeneralizeToRepresentative(model2, target_accuracy=0.7, features=features, is_regression=True,\n",
" features_to_minimize=QI)\n",
"\n",
"# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
"# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n",
"# data it could result in a larger gap)\n",
"# Don't forget to leave a hold-out set for final validation!\n",
"X_generalizer_train2, x_test2, y_generalizer_train2, y_test2 = train_test_split(X_test, y_test,\n",
" test_size = 0.4, random_state = 38)\n",
"\n",
"x_train_predictions2 = model2.predict(X_generalizer_train2)\n",
"minimizer2.fit(X_generalizer_train2, x_train_predictions2)\n",
"transformed2 = minimizer2.transform(x_test2)\n",
"print('Accuracy on minimized data: ', model2.score(transformed2, y_test2))\n",
"print('generalizations: ',minimizer2.generalizations_)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View file

@ -0,0 +1,385 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# Applying data minimization with categorical data and only a subset of the features to a trained ML model"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"In this tutorial we will show how to perform data minimization for ML models using the minimization module.\n",
"\n",
"This will be demonstrated using the German Credit dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data)."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Load data\n",
"QI parameter determines which features will be minimized."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 1,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Existing_checking_account Duration_in_month Credit_history Purpose \\\n",
"0 A14 24 A32 A41 \n",
"1 A14 33 A33 A49 \n",
"2 A11 9 A32 A42 \n",
"3 A14 28 A34 A43 \n",
"4 A11 24 A33 A43 \n",
".. ... ... ... ... \n",
"695 A14 12 A32 A43 \n",
"696 A14 13 A32 A43 \n",
"697 A11 48 A30 A41 \n",
"698 A12 21 A34 A42 \n",
"699 A13 15 A32 A46 \n",
"\n",
" Credit_amount Savings_account Present_employment_since Installment_rate \\\n",
"0 7814 A61 A74 3 \n",
"1 2764 A61 A73 2 \n",
"2 2136 A61 A73 3 \n",
"3 2743 A61 A75 4 \n",
"4 1659 A61 A72 4 \n",
".. ... ... ... ... \n",
"695 1963 A61 A74 4 \n",
"696 1409 A62 A71 2 \n",
"697 4605 A61 A75 3 \n",
"698 2745 A64 A74 3 \n",
"699 1905 A61 A75 4 \n",
"\n",
" Personal_status_sex debtors Present_residence Property Age \\\n",
"0 A93 A101 3 A123 38 \n",
"1 A92 A101 2 A123 26 \n",
"2 A93 A101 2 A121 25 \n",
"3 A93 A101 2 A123 29 \n",
"4 A92 A101 2 A123 29 \n",
".. ... ... ... ... ... \n",
"695 A93 A101 2 A123 31 \n",
"696 A92 A101 4 A121 64 \n",
"697 A93 A101 4 A124 24 \n",
"698 A93 A101 2 A123 32 \n",
"699 A93 A101 4 A123 40 \n",
"\n",
" Other_installment_plans Housing Number_of_existing_credits Job \\\n",
"0 A143 A152 1 A174 \n",
"1 A143 A152 2 A173 \n",
"2 A143 A152 1 A173 \n",
"3 A143 A152 2 A173 \n",
"4 A143 A151 1 A172 \n",
".. ... ... ... ... \n",
"695 A143 A151 2 A174 \n",
"696 A143 A152 1 A173 \n",
"697 A143 A153 2 A173 \n",
"698 A143 A152 2 A173 \n",
"699 A143 A151 1 A174 \n",
"\n",
" N_people_being_liable_provide_maintenance Telephone Foreign_worker \n",
"0 1 1 1 \n",
"1 1 1 1 \n",
"2 1 0 1 \n",
"3 1 0 1 \n",
"4 1 1 1 \n",
".. ... ... ... \n",
"695 2 1 1 \n",
"696 1 0 1 \n",
"697 2 0 1 \n",
"698 1 1 1 \n",
"699 1 1 1 \n",
"\n",
"[700 rows x 20 columns]\n"
]
}
],
"source": [
"from apt.utils import get_german_credit_dataset\n",
"\n",
"(x_train, y_train), (x_test, y_test) = get_german_credit_dataset()\n",
"features = [\"Existing_checking_account\", \"Duration_in_month\", \"Credit_history\", \"Purpose\", \"Credit_amount\",\n",
" \"Savings_account\", \"Present_employment_since\", \"Installment_rate\", \"Personal_status_sex\", \"debtors\",\n",
" \"Present_residence\", \"Property\", \"Age\", \"Other_installment_plans\", \"Housing\",\n",
" \"Number_of_existing_credits\", \"Job\", \"N_people_being_liable_provide_maintenance\", \"Telephone\",\n",
" \"Foreign_worker\"]\n",
"categorical_features = [\"Existing_checking_account\", \"Credit_history\", \"Purpose\", \"Savings_account\",\n",
" \"Present_employment_since\", \"Personal_status_sex\", \"debtors\", \"Property\",\n",
" \"Other_installment_plans\", \"Housing\", \"Job\"]\n",
"QI = [\"Duration_in_month\", \"Credit_history\", \"Purpose\", \"debtors\", \"Property\", \"Other_installment_plans\",\n",
" \"Housing\", \"Job\"]\n",
"\n",
"print(x_train)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## Train decision tree model\n",
"we use OneHotEncoder to handle categorical features."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Base model accuracy: 0.7033333333333334\n"
]
}
],
"source": [
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"numeric_features = [f for f in features if f not in categorical_features]\n",
"numeric_transformer = Pipeline(\n",
" steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]\n",
")\n",
"categorical_transformer = OneHotEncoder(handle_unknown=\"ignore\", sparse=False)\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" (\"num\", numeric_transformer, numeric_features),\n",
" (\"cat\", categorical_transformer, categorical_features),\n",
" ]\n",
")\n",
"encoded_train = preprocessor.fit_transform(x_train)\n",
"model = DecisionTreeClassifier()\n",
"model.fit(encoded_train, y_train)\n",
"\n",
"encoded_test = preprocessor.transform(x_test)\n",
"print('Base model accuracy: ', model.score(encoded_test, y_test))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"## Run minimization\n",
"We will try to run minimization with categorical features and only a subset of the features with different possible values of target accuracy (how close to the original model's accuracy we want to get, 1 being same accuracy as for original data)."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.791667\n",
"Improving accuracy\n",
"feature to remove: Property\n",
"Removed feature: Property, new relative accuracy: 0.819444\n",
"feature to remove: Other_installment_plans\n",
"Removed feature: Other_installment_plans, new relative accuracy: 0.833333\n",
"feature to remove: Job\n",
"Removed feature: Job, new relative accuracy: 0.833333\n",
"feature to remove: Housing\n",
"Removed feature: Housing, new relative accuracy: 0.833333\n",
"feature to remove: Purpose\n",
"Removed feature: Purpose, new relative accuracy: 0.916667\n",
"feature to remove: Credit_history\n",
"Removed feature: Credit_history, new relative accuracy: 0.930556\n",
"feature to remove: debtors\n",
"Removed feature: debtors, new relative accuracy: 0.944444\n",
"feature to remove: Duration_in_month\n",
"Removed feature: Duration_in_month, new relative accuracy: 1.000000\n",
"Accuracy on minimized data: 0.6666666666666666\n"
]
}
],
"source": [
"import sys\n",
"import os\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
"\n",
"from apt.minimization import GeneralizeToRepresentative\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# default target_accuracy is 0.998\n",
"minimizer = GeneralizeToRepresentative(model, features=features,\n",
" categorical_features=categorical_features, features_to_minimize=QI)\n",
"\n",
"# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
"# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n",
"# data it could result in a larger gap)\n",
"# Don't forget to leave a hold-out set for final validation!\n",
"X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(x_test, y_test, stratify=y_test,\n",
" test_size = 0.4, random_state = 38)\n",
"X_generalizer_train.reset_index(drop=True, inplace=True)\n",
"y_generalizer_train.reset_index(drop=True, inplace=True)\n",
"x_test.reset_index(drop=True, inplace=True)\n",
"y_test.reset_index(drop=True, inplace=True)\n",
"encoded_generalizer_train = preprocessor.transform(X_generalizer_train)\n",
"x_train_predictions = model.predict(encoded_generalizer_train)\n",
"minimizer.fit(X_generalizer_train, x_train_predictions)\n",
"transformed = minimizer.transform(x_test)\n",
"\n",
"encoded_transformed = preprocessor.transform(transformed)\n",
"print('Accuracy on minimized data: ', model.score(encoded_transformed, y_test))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"#### Let's see what features were generalized"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'ranges': {}, 'categories': {}, 'untouched': ['Purpose', 'Present_residence', 'Credit_history', 'Telephone', 'Job', 'Housing', 'Installment_rate', 'Number_of_existing_credits', 'Foreign_worker', 'Existing_checking_account', 'Other_installment_plans', 'N_people_being_liable_provide_maintenance', 'Property', 'Savings_account', 'Present_employment_since', 'Personal_status_sex', 'Duration_in_month', 'debtors', 'Credit_amount', 'Age']}\n"
]
}
],
"source": [
"generalizations = minimizer.generalizations\n",
"print(generalizations)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"We can see that for the default target accuracy of 0.998 of the original accuracy, no generalizations are possible (all features are left untouched, i.e., not generalized).\n",
"\n",
"Let's change to a slightly lower target accuracy."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.791667\n",
"Improving accuracy\n",
"feature to remove: Property\n",
"Removed feature: Property, new relative accuracy: 0.819444\n",
"feature to remove: Other_installment_plans\n",
"Removed feature: Other_installment_plans, new relative accuracy: 0.833333\n",
"feature to remove: Job\n",
"Removed feature: Job, new relative accuracy: 0.833333\n",
"feature to remove: Housing\n",
"Removed feature: Housing, new relative accuracy: 0.833333\n",
"feature to remove: Purpose\n",
"Removed feature: Purpose, new relative accuracy: 0.916667\n",
"feature to remove: Credit_history\n",
"Removed feature: Credit_history, new relative accuracy: 0.930556\n",
"Accuracy on minimized data: 0.6416666666666667\n",
"{'ranges': {'Duration_in_month': [7.0, 8.5, 11.0, 13.0, 14.0, 18.0, 23.0, 25.5, 34.5, 47.5]}, 'categories': {'debtors': [['A101', 'A102'], ['A103']]}, 'untouched': ['Existing_checking_account', 'Savings_account', 'Present_employment_since', 'Property', 'Housing', 'Purpose', 'Personal_status_sex', 'Present_residence', 'Credit_history', 'Telephone', 'Installment_rate', 'Other_installment_plans', 'Number_of_existing_credits', 'Credit_amount', 'N_people_being_liable_provide_maintenance', 'Foreign_worker', 'Age', 'Job']}\n"
]
}
],
"source": [
"# We allow a 1% deviation in accuracy from the original model accuracy\n",
"minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.92, features=features,\n",
" categorical_features=categorical_features, features_to_minimize=QI)\n",
"\n",
"minimizer2.fit(X_generalizer_train, x_train_predictions)\n",
"transformed2 = minimizer2.transform(x_test)\n",
"\n",
"encoded_transformed2 = preprocessor.transform(transformed2)\n",
"print('Accuracy on minimized data: ', model.score(encoded_transformed2, y_test))\n",
"generalizations2 = minimizer2.generalizations\n",
"print(generalizations2)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"This time we were able to generalize two features (Duration_in_month and debtors)."
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View file

@ -1,7 +1,7 @@
[metadata]
# replace with your username:
name = ai-privacy-toolkit
version = 0.0.3
version = 0.0.4
author = Abigail Goldsteen
author_email = abigailt@il.ibm.com
description = A toolkit for tools and techniques related to the privacy and compliance of AI models.

View file

@ -1,5 +1,8 @@
import pytest
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
@ -19,7 +22,7 @@ def test_anonymize_ndarray_iris():
k = 10
QI = [0, 2]
anonymizer = Anonymize(k, QI)
anonymizer = Anonymize(k, QI, train_only_QI=True)
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
assert(len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0)))
_, counts_elements = np.unique(anon[:, QI], return_counts=True)
@ -29,18 +32,31 @@ def test_anonymize_ndarray_iris():
def test_anonymize_pandas_adult():
(x_train, y_train), _ = get_adult_dataset()
encoded = OneHotEncoder().fit_transform(x_train)
model = DecisionTreeClassifier()
model.fit(encoded, y_train)
pred = model.predict(encoded)
k = 100
features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation',
'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
QI = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'native-country']
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'native-country']
# prepare data for DT
numeric_features = [f for f in features if f not in categorical_features]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
encoded = preprocessor.fit_transform(x_train)
model = DecisionTreeClassifier()
model.fit(encoded, y_train)
pred = model.predict(encoded)
anonymizer = Anonymize(k, QI, categorical_features=categorical_features)
anon = anonymizer.anonymize(ArrayDataset(x_train, pred, features))
@ -50,17 +66,30 @@ def test_anonymize_pandas_adult():
def test_anonymize_pandas_nursery():
(x_train, y_train), _ = get_nursery_dataset()
features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health"]
x_train = x_train.astype(str)
encoded = OneHotEncoder().fit_transform(x_train)
k = 100
features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health"]
QI = ["finance", "social", "health"]
categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health", 'children']
# prepare data for DT
numeric_features = [f for f in features if f not in categorical_features]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
encoded = preprocessor.fit_transform(x_train)
model = DecisionTreeClassifier()
model.fit(encoded, y_train)
pred = model.predict(encoded)
k = 100
QI = ["finance", "social", "health"]
categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health", 'children']
anonymizer = Anonymize(k, QI, categorical_features=categorical_features)
anonymizer = Anonymize(k, QI, categorical_features=categorical_features, train_only_QI=True)
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
@ -78,7 +107,7 @@ def test_regression():
pred = model.predict(x_train)
k = 10
QI = [0, 2, 5, 8]
anonymizer = Anonymize(k, QI, is_regression=True)
anonymizer = Anonymize(k, QI, is_regression=True, train_only_QI=True)
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
model.fit(anon, y_train)

View file

@ -796,7 +796,7 @@ def test_BaseEstimator_regression():
transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features))
print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
model.fit(transformed, y_train)
print('Base model accuracy (R2 score) after anonymization: ', model.score(x_test, y_test))
print('Base model accuracy (R2 score) after minimization: ', model.score(x_test, y_test))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {
'age': [-0.07816532626748085, -0.07090024650096893, -0.05637009255588055, -0.05092128552496433,