Squashed commit of the following:

commit d53818644e Author: olasaadi <92303887+olasaadi@users.noreply.github.com> Date: Mon Mar 7 20:12:55 2022 +0200 Build the dt on all features anon (#23) * add param to build the DT on all features and not just on QI * one-hot encoding only for categorical features commit c47819a031 Author: abigailt <abigailt@il.ibm.com> Date: Wed Feb 23 19:40:11 2022 +0200 Update docs commit 7e2ce7fe96 Merge: 7fbd1e4 752871d Author: abigailt <abigailt@il.ibm.com> Date: Wed Feb 23 19:26:44 2022 +0200 Merge remote-tracking branch 'origin/main' into main commit 7fbd1e4b90 Author: abigailt <abigailt@il.ibm.com> Date: Wed Feb 23 19:22:54 2022 +0200 Update version and docs commit 752871dd0c Author: olasaadi <92303887+olasaadi@users.noreply.github.com> Date: Wed Feb 23 14:57:12 2022 +0200 add minimization notebook (#22) * add german credit notebook to showcase new features (minimize only some features and categorical features) * add notebook to show minimization data on a regression problem
2026-06-23 15:48:06 +02:00 · 2022-04-25 17:39:30 +03:00 · 2022-04-25 17:39:30 +03:00 · a37ff06df8
commit a37ff06df8
parent fb2413c4aa
12 changed files with 753 additions and 69 deletions
--- a/apt/init.py
+++ b/apt/init.py
@ -6,4 +6,4 @@ from apt import anonymization
 from apt import minimization
 from apt import utils
-__version__ = "0.0.3"
+__version__ = "0.0.4"
--- a/apt/anonymization/anonymizer.py
+++ b/apt/anonymization/anonymizer.py
@ -3,6 +3,9 @@ import pandas as pd
 from scipy.spatial import distance
 from collections import Counter
 from sklearn.compose import ColumnTransformer
 from sklearn.impute import SimpleImputer
 from sklearn.pipeline import Pipeline
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.preprocessing import OneHotEncoder
 from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
@ -15,28 +18,38 @@ class Anonymize:
    Class for performing tailored, model-guided anonymization of training datasets for ML models.
    Based on the implementation described in: https://arxiv.org/abs/2007.13086
    Parameters
    ----------
    k : int
        The privacy parameter that determines the number of records that will be indistinguishable from each
        other (when looking at the quasi identifiers). Should be at least 2.
    quasi_identifiers : np.ndarray or list
        The features that need to be minimized in case of pandas data, and indexes of features
        in case of numpy data.
    categorical_features : list, optional
        The list of categorical features (should only be supplied when passing data as a
        pandas dataframe).
    is_regression : Bool, optional
        Whether the model is a regression model or not (if False, assumes
        a classification model). Default is False.
    train_only_QI : Bool, optional
        Whether to train the anonymization decision tree only on the quasi-identifier
        features. Default is False, i.e. the tree is trained on all features.
    """
    def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
-                 is_regression=False):
+                 is_regression=False, train_only_QI=False):
        """
        :param k: The privacy parameter that determines the number of records that will be indistinguishable from each
                  other (when looking at the quasi identifiers). Should be at least 2.
        :param quasi_identifiers: The features that need to be minimized. It can be a list of feature names (strings) if
                  dataset.feature_names is set, otherwise a list of indexes (integers).
        :param categorical_features: The list of categorical features. It can be a list of feature names (strings) if
                  dataset.feature_names is set, otherwise a list of indexes (integers).
        :param is_regression: Boolean param indicating that this is a regression problem.
        """
        if k < 2:
            raise ValueError("k should be a positive integer with a value of 2 or higher")
        if quasi_identifiers is None or len(quasi_identifiers) < 1:
            raise ValueError("The list of quasi-identifiers cannot be empty")
        self.k = k
        self.quasi_identifiers = quasi_identifiers
        self.categorical_features = categorical_features
        self.is_regression = is_regression
        self.features_names = None
        self.train_only_QI = train_only_QI
    def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
        """
@ -72,7 +85,10 @@ class Anonymize:
    def _anonymize(self, x, y):
        if x.shape[0] != y.shape[0]:
            raise ValueError("x and y should have same number of rows")
-        x_anonymizer_train = x[:, self.quasi_identifiers]
+        x_anonymizer_train = x
        if self.train_only_QI:
            # build DT just on QI features
            x_anonymizer_train = x[:, self.quasi_identifiers]
        if x.dtype.kind not in 'iufc':
            if not self.categorical_features:
                raise ValueError('when supplying an array with non-numeric data, categorical_features must be defined')
@ -151,6 +167,21 @@ class Anonymize:
        return x
    def _modify_categorical_features(self, x):
-        encoder = OneHotEncoder()
+        # prepare data for DT
-        one_hot_encoded = encoder.fit_transform(x)
+        used_features = self.features
-        return one_hot_encoded
+        if self.train_only_QI:
            used_features = self.quasi_identifiers
        numeric_features = [f for f in x.columns if f in used_features and f not in self.categorical_features]
        categorical_features = [f for f in self.categorical_features if f in used_features]
        numeric_transformer = Pipeline(
            steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
        )
        categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ]
        )
        encoded = preprocessor.fit_transform(x)
        return encoded
--- a/apt/minimization/init.py
+++ b/apt/minimization/init.py
@ -12,8 +12,5 @@ them to new data.
 It is also possible to export the generalizations as feature ranges.
 The current implementation supports only numeric features, so any categorical features must be transformed to a numeric
 representation before using this class.
 """
 from apt.minimization.minimizer import GeneralizeToRepresentative
--- a/apt/minimization/minimizer.py
+++ b/apt/minimization/minimizer.py
@ -22,6 +22,7 @@ from apt.utils.models import Model, SklearnRegressor, ModelOutputType, SklearnCl
 class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
    """ A transformer that generalizes data to representative points.
    Learns data generalizations based on an original model's predictions
    and a target accuracy. Once the generalizations are learned, can
    receive one or more data records and transform them to representative
@ -58,6 +59,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        The required method to train data set for minimizing. Default is
        to train the tree just on the features that are given as
        features_to_minimize.
    is_regression : Bool, optional
        Whether the model is a regression model or not (if False, assumes
        a classification model). Default is False.
    Attributes
    ----------
    features_ : list of str
@ -69,8 +74,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        as measured on the training data.
    generalizations_ : object
        The generalizations that were learned (actual feature ranges).
    Notes
    -----
    """
    def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: float = 0.998,
@ -95,11 +98,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
    def get_params(self, deep=True):
        """Get parameters for this estimator.
        Parameters
        ----------
        deep : boolean, optional
            If True, will return the parameters for this estimator and contained
            subobjects that are estimators.
        Returns
        -------
        params : mapping of string to any
@ -116,6 +121,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
    def set_params(self, **params):
        """Set the parameters of this estimator.
        Returns
        -------
        self : object
@ -134,6 +140,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
    def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
                      features_names: Optional = None, dataset: Optional[ArrayDataset] = None):
        """Learns the generalizations based on training data, and applies them to the data.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
@ -158,6 +165,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
    def fit(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
            features_names: Optional = None, dataset: ArrayDataset = None):
        """Learns the generalizations based on training data.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
@ -380,6 +388,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
    def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional = None, dataset: ArrayDataset = None):
        """ Transforms data records to representative points.
        Parameters
        ----------
        X : {array-like, sparse-matrix}, shape (n_samples, n_features), If provided as a pandas dataframe,
--- a/apt/utils/dataset_utils.py
+++ b/apt/utils/dataset_utils.py
@ -18,14 +18,14 @@ def _load_iris(test_set_size: float = 0.3):
    return (x_train, y_train), (x_test, y_test)
-def get_iris_dataset():
+def get_iris_dataset(test_set: float = 0.3):
    """
    Loads the Iris dataset from scikit-learn.
    :param test_set: Proportion of the data to use as validation split (value between 0 and 1).
    :return: Entire dataset and labels as numpy array.
    """
-    return _load_iris()
+    return _load_iris(test_set)
 def _load_diabetes(test_set_size: float = 0.3):
@ -54,6 +54,7 @@ def get_german_credit_dataset(test_set: float = 0.3):
    """
    Loads the UCI German_credit dataset from `tests/datasets/german` or downloads it if necessary.
    :param test_set: Proportion of the data to use as validation split (value between 0 and 1).
    :return: Dataset and labels as pandas dataframes.
    """
--- a/docs/conf.py
+++ b/docs/conf.py
@ -22,7 +22,7 @@ copyright = '2021, IBM'
 author = 'Abigail Goldsteen'
 # The full version, including alpha/beta/rc tags
-release = '0.0.3'
+release = '0.0.4'
 master_doc = 'index'
--- a/docs/source/tests.rst
+++ b/docs/source/tests.rst
@ -1,30 +0,0 @@
 tests package
 =============
 Submodules
 ----------
 tests.test\_anonymizer module
 -----------------------------
 .. automodule:: tests.test_anonymizer
    :members:
    :undoc-members:
    :show-inheritance:
 tests.test\_minimizer module
 ----------------------------
 .. automodule:: tests.test_minimizer
    :members:
    :undoc-members:
    :show-inheritance:
 Module contents
 ---------------
 .. automodule:: tests
    :members:
    :undoc-members:
    :show-inheritance:
--- a/notebooks/minimization_diabetes_reg.ipynb
+++ b/notebooks/minimization_diabetes_reg.ipynb
@ -0,0 +1,262 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true,
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "# Applying data minimization to a trained regression ML model"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "In this tutorial we will show how to perform data minimization for regression ML models using the minimization module.\n",
    "\n",
    "We will demonstrate applying data minimization to two different trained regression models."
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Load data\n",
    "QI parameter determines which features will be minimized."
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_diabetes\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "dataset = load_diabetes()\n",
    "X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)\n",
    "\n",
    "features = ['age', 'sex', 'bmi', 'bp',\n",
    "                's1', 's2', 's3', 's4', 's5', 's6']\n",
    "QI = [0, 2, 5, 8, 9]"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Train DecisionTreeRegressor model"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Base model accuracy (R2 score):  0.15014421352446072\n"
     ]
    }
   ],
   "source": [
    "from apt.minimization import GeneralizeToRepresentative\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "\n",
    "model1 = DecisionTreeRegressor(random_state=10, min_samples_split=2)\n",
    "model1.fit(X_train, y_train)\n",
    "print('Base model accuracy (R2 score): ', model1.score(X_test, y_test))"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Run minimization\n",
    "We will try to run minimization with only a subset of the features."
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.108922\n",
      "Improving accuracy\n",
      "feature to remove: s5\n",
      "Removed feature: s5, new relative accuracy: 0.505498\n",
      "feature to remove: s6\n",
      "Removed feature: s6, new relative accuracy: 0.404757\n",
      "feature to remove: bmi\n",
      "Removed feature: bmi, new relative accuracy: 0.718978\n",
      "Accuracy on minimized data:  0.11604533946025941\n",
      "generalizations:  {'ranges': {'age': [-0.07090024650096893, -0.043656209483742714, -0.041839939542114735, -0.03639113181270659, -0.01459590089507401, -0.012779632292222232, -0.009147093165665865, -0.0036982858437113464, 0.03989217430353165, 0.039892176166176796, 0.05623859912157059, 0.06713621318340302], 's2': [-0.0550188384950161, -0.0285577941685915, -0.024643437936902046, -0.02135537937283516, -0.013683241792023182, -0.006480826530605555, 0.009176596067845821, 0.023111702874302864, 0.02420772146433592, 0.02655633445829153, 0.039082273840904236]}, 'categories': {}, 'untouched': ['s3', 'bmi', 's6', 'bp', 's4', 's5', 'sex', 's1']}\n"
     ]
    }
   ],
   "source": [
    "# note that is_regression param is True\n",
    "\n",
    "minimizer1 = GeneralizeToRepresentative(model1, target_accuracy=0.7, features=features, is_regression=True,\n",
    "                                    features_to_minimize=QI)\n",
    "\n",
    "# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
    "# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n",
    "# data it could result in a larger gap)\n",
    "# Don't forget to leave a hold-out set for final validation!\n",
    "X_generalizer_train1, x_test1, y_generalizer_train1, y_test1 = train_test_split(X_test, y_test,\n",
    "                                                                test_size = 0.4, random_state = 38)\n",
    "\n",
    "x_train_predictions1 = model1.predict(X_generalizer_train1)\n",
    "minimizer1.fit(X_generalizer_train1, x_train_predictions1)\n",
    "transformed1 = minimizer1.transform(x_test1)\n",
    "print('Accuracy on minimized data: ', model1.score(transformed1, y_test1))\n",
    "print('generalizations: ',minimizer1.generalizations_)#%% md"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Train linear regression model"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "from apt.minimization import GeneralizeToRepresentative\n",
    "\n",
    "model2 = LinearRegression()\n",
    "model2.fit(X_train, y_train)\n",
    "print('Base model accuracy (R2 score): ', model2.score(X_test, y_test))"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Run minimization\n",
    "We will try to run minimization with only a subset of the features."
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.225782\n",
      "Improving accuracy\n",
      "feature to remove: age\n",
      "Removed feature: age, new relative accuracy: 0.223565\n",
      "feature to remove: s2\n",
      "Removed feature: s2, new relative accuracy: 0.759788\n",
      "Accuracy on minimized data:  0.4414329261774286\n",
      "generalizations:  {'ranges': {'bmi': [-0.0660245232284069, -0.06171327643096447, -0.048779530450701714, -0.036923596635460854, -0.022912041284143925, -0.015906263142824173, -0.009978296235203743, 0.007266696775332093, 0.022356065921485424, 0.028822937980294228, 0.04499012045562267, 0.053073709830641747, 0.10103634744882584], 's5': [-0.08940735459327698, -0.07823517918586731, -0.07310866191983223, -0.07022909820079803, -0.06740894541144371, -0.06558558344841003, -0.041897499933838844, -0.04049498960375786, -0.03781316243112087, -0.033939776942133904, -0.03263746201992035, -0.02538660168647766, -0.023219254799187183, -0.017585186287760735, -0.016525186598300934, -0.008522996446117759, 0.0015758189256303012, 0.012934560421854258, 0.014069339726120234, 0.015929921995848417, 0.01947084255516529, 0.028651678003370762, 0.03358383011072874, 0.03639278281480074, 0.041416410356760025, 0.06386702693998814], 's6': [-0.07356456853449345, -0.052854035049676895, -0.048711927607655525, -0.0383566590026021, -0.02800139266764745, -0.021788232028484344, -0.007290858076885343, -0.007290857844054699, 0.017561784014105797, 0.02377494378015399, 0.02791705122217536, 0.02998810407007113, 0.054840744473040104]}, 'categories': {}, 'untouched': ['s2', 's3', 'bp', 's4', 'age', 'sex', 's1']}\n"
     ]
    }
   ],
   "source": [
    "# note that is_regression param is True\n",
    "\n",
    "minimizer2 = GeneralizeToRepresentative(model2, target_accuracy=0.7, features=features, is_regression=True,\n",
    "                                    features_to_minimize=QI)\n",
    "\n",
    "# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
    "# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n",
    "# data it could result in a larger gap)\n",
    "# Don't forget to leave a hold-out set for final validation!\n",
    "X_generalizer_train2, x_test2, y_generalizer_train2, y_test2 = train_test_split(X_test, y_test,\n",
    "                                                                test_size = 0.4, random_state = 38)\n",
    "\n",
    "x_train_predictions2 = model2.predict(X_generalizer_train2)\n",
    "minimizer2.fit(X_generalizer_train2, x_train_predictions2)\n",
    "transformed2 = minimizer2.transform(x_test2)\n",
    "print('Accuracy on minimized data: ', model2.score(transformed2, y_test2))\n",
    "print('generalizations: ',minimizer2.generalizations_)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
--- a/notebooks/minimization_german_credit.ipynb
+++ b/notebooks/minimization_german_credit.ipynb
@ -0,0 +1,385 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# Applying data minimization with categorical data and only a subset of the features to a trained ML model"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "In this tutorial we will show how to perform data minimization for ML models using the minimization module.\n",
    "\n",
    "This will be demonstrated using the German Credit dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data)."
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Load data\n",
    "QI parameter determines which features will be minimized."
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    Existing_checking_account  Duration_in_month Credit_history Purpose  \\\n",
      "0                         A14                 24            A32     A41   \n",
      "1                         A14                 33            A33     A49   \n",
      "2                         A11                  9            A32     A42   \n",
      "3                         A14                 28            A34     A43   \n",
      "4                         A11                 24            A33     A43   \n",
      "..                        ...                ...            ...     ...   \n",
      "695                       A14                 12            A32     A43   \n",
      "696                       A14                 13            A32     A43   \n",
      "697                       A11                 48            A30     A41   \n",
      "698                       A12                 21            A34     A42   \n",
      "699                       A13                 15            A32     A46   \n",
      "\n",
      "     Credit_amount Savings_account Present_employment_since  Installment_rate  \\\n",
      "0             7814             A61                      A74                 3   \n",
      "1             2764             A61                      A73                 2   \n",
      "2             2136             A61                      A73                 3   \n",
      "3             2743             A61                      A75                 4   \n",
      "4             1659             A61                      A72                 4   \n",
      "..             ...             ...                      ...               ...   \n",
      "695           1963             A61                      A74                 4   \n",
      "696           1409             A62                      A71                 2   \n",
      "697           4605             A61                      A75                 3   \n",
      "698           2745             A64                      A74                 3   \n",
      "699           1905             A61                      A75                 4   \n",
      "\n",
      "    Personal_status_sex debtors  Present_residence Property  Age  \\\n",
      "0                   A93    A101                  3     A123   38   \n",
      "1                   A92    A101                  2     A123   26   \n",
      "2                   A93    A101                  2     A121   25   \n",
      "3                   A93    A101                  2     A123   29   \n",
      "4                   A92    A101                  2     A123   29   \n",
      "..                  ...     ...                ...      ...  ...   \n",
      "695                 A93    A101                  2     A123   31   \n",
      "696                 A92    A101                  4     A121   64   \n",
      "697                 A93    A101                  4     A124   24   \n",
      "698                 A93    A101                  2     A123   32   \n",
      "699                 A93    A101                  4     A123   40   \n",
      "\n",
      "    Other_installment_plans Housing  Number_of_existing_credits   Job  \\\n",
      "0                      A143    A152                           1  A174   \n",
      "1                      A143    A152                           2  A173   \n",
      "2                      A143    A152                           1  A173   \n",
      "3                      A143    A152                           2  A173   \n",
      "4                      A143    A151                           1  A172   \n",
      "..                      ...     ...                         ...   ...   \n",
      "695                    A143    A151                           2  A174   \n",
      "696                    A143    A152                           1  A173   \n",
      "697                    A143    A153                           2  A173   \n",
      "698                    A143    A152                           2  A173   \n",
      "699                    A143    A151                           1  A174   \n",
      "\n",
      "     N_people_being_liable_provide_maintenance  Telephone  Foreign_worker  \n",
      "0                                            1          1               1  \n",
      "1                                            1          1               1  \n",
      "2                                            1          0               1  \n",
      "3                                            1          0               1  \n",
      "4                                            1          1               1  \n",
      "..                                         ...        ...             ...  \n",
      "695                                          2          1               1  \n",
      "696                                          1          0               1  \n",
      "697                                          2          0               1  \n",
      "698                                          1          1               1  \n",
      "699                                          1          1               1  \n",
      "\n",
      "[700 rows x 20 columns]\n"
     ]
    }
   ],
   "source": [
    "from apt.utils import get_german_credit_dataset\n",
    "\n",
    "(x_train, y_train), (x_test, y_test) = get_german_credit_dataset()\n",
    "features = [\"Existing_checking_account\", \"Duration_in_month\", \"Credit_history\", \"Purpose\", \"Credit_amount\",\n",
    "                \"Savings_account\", \"Present_employment_since\", \"Installment_rate\", \"Personal_status_sex\", \"debtors\",\n",
    "                \"Present_residence\", \"Property\", \"Age\", \"Other_installment_plans\", \"Housing\",\n",
    "                \"Number_of_existing_credits\", \"Job\", \"N_people_being_liable_provide_maintenance\", \"Telephone\",\n",
    "                \"Foreign_worker\"]\n",
    "categorical_features = [\"Existing_checking_account\", \"Credit_history\", \"Purpose\", \"Savings_account\",\n",
    "                        \"Present_employment_since\", \"Personal_status_sex\", \"debtors\", \"Property\",\n",
    "                        \"Other_installment_plans\", \"Housing\", \"Job\"]\n",
    "QI = [\"Duration_in_month\", \"Credit_history\", \"Purpose\", \"debtors\", \"Property\", \"Other_installment_plans\",\n",
    "      \"Housing\", \"Job\"]\n",
    "\n",
    "print(x_train)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Train decision tree model\n",
    "We use OneHotEncoder to handle categorical features."
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Base model accuracy:  0.7033333333333334\n"
     ]
    }
   ],
   "source": [
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "numeric_features = [f for f in features if f not in categorical_features]\n",
    "numeric_transformer = Pipeline(\n",
    "    steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]\n",
    ")\n",
    "categorical_transformer = OneHotEncoder(handle_unknown=\"ignore\", sparse=False)\n",
    "preprocessor = ColumnTransformer(\n",
    "    transformers=[\n",
    "        (\"num\", numeric_transformer, numeric_features),\n",
    "        (\"cat\", categorical_transformer, categorical_features),\n",
    "    ]\n",
    ")\n",
    "encoded_train = preprocessor.fit_transform(x_train)\n",
    "model = DecisionTreeClassifier()\n",
    "model.fit(encoded_train, y_train)\n",
    "\n",
    "encoded_test = preprocessor.transform(x_test)\n",
    "print('Base model accuracy: ', model.score(encoded_test, y_test))"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Run minimization\n",
    "We will try to run minimization with categorical features and only a subset of the features with different possible values of target accuracy (how close to the original model's accuracy we want to get, 1 being same accuracy as for original data)."
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.791667\n",
      "Improving accuracy\n",
      "feature to remove: Property\n",
      "Removed feature: Property, new relative accuracy: 0.819444\n",
      "feature to remove: Other_installment_plans\n",
      "Removed feature: Other_installment_plans, new relative accuracy: 0.833333\n",
      "feature to remove: Job\n",
      "Removed feature: Job, new relative accuracy: 0.833333\n",
      "feature to remove: Housing\n",
      "Removed feature: Housing, new relative accuracy: 0.833333\n",
      "feature to remove: Purpose\n",
      "Removed feature: Purpose, new relative accuracy: 0.916667\n",
      "feature to remove: Credit_history\n",
      "Removed feature: Credit_history, new relative accuracy: 0.930556\n",
      "feature to remove: debtors\n",
      "Removed feature: debtors, new relative accuracy: 0.944444\n",
      "feature to remove: Duration_in_month\n",
      "Removed feature: Duration_in_month, new relative accuracy: 1.000000\n",
      "Accuracy on minimized data:  0.6666666666666666\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "import os\n",
    "sys.path.insert(0, os.path.abspath('..'))\n",
    "\n",
    "from apt.minimization import GeneralizeToRepresentative\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# default target_accuracy is 0.998\n",
    "minimizer = GeneralizeToRepresentative(model, features=features,\n",
    "                                     categorical_features=categorical_features, features_to_minimize=QI)\n",
    "\n",
    "# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
    "# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n",
    "# data it could result in a larger gap)\n",
    "# Don't forget to leave a hold-out set for final validation!\n",
    "X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(x_test, y_test, stratify=y_test,\n",
    "                                                                test_size = 0.4, random_state = 38)\n",
    "X_generalizer_train.reset_index(drop=True, inplace=True)\n",
    "y_generalizer_train.reset_index(drop=True, inplace=True)\n",
    "x_test.reset_index(drop=True, inplace=True)\n",
    "y_test.reset_index(drop=True, inplace=True)\n",
    "encoded_generalizer_train = preprocessor.transform(X_generalizer_train)\n",
    "x_train_predictions = model.predict(encoded_generalizer_train)\n",
    "minimizer.fit(X_generalizer_train, x_train_predictions)\n",
    "transformed = minimizer.transform(x_test)\n",
    "\n",
    "encoded_transformed = preprocessor.transform(transformed)\n",
    "print('Accuracy on minimized data: ', model.score(encoded_transformed, y_test))"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "#### Let's see what features were generalized"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'ranges': {}, 'categories': {}, 'untouched': ['Purpose', 'Present_residence', 'Credit_history', 'Telephone', 'Job', 'Housing', 'Installment_rate', 'Number_of_existing_credits', 'Foreign_worker', 'Existing_checking_account', 'Other_installment_plans', 'N_people_being_liable_provide_maintenance', 'Property', 'Savings_account', 'Present_employment_since', 'Personal_status_sex', 'Duration_in_month', 'debtors', 'Credit_amount', 'Age']}\n"
     ]
    }
   ],
   "source": [
    "generalizations = minimizer.generalizations\n",
    "print(generalizations)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "We can see that for the default target accuracy of 0.998 of the original accuracy, no generalizations are possible (all features are left untouched, i.e., not generalized).\n",
    "\n",
    "Let's change to a slightly lower target accuracy."
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.791667\n",
      "Improving accuracy\n",
      "feature to remove: Property\n",
      "Removed feature: Property, new relative accuracy: 0.819444\n",
      "feature to remove: Other_installment_plans\n",
      "Removed feature: Other_installment_plans, new relative accuracy: 0.833333\n",
      "feature to remove: Job\n",
      "Removed feature: Job, new relative accuracy: 0.833333\n",
      "feature to remove: Housing\n",
      "Removed feature: Housing, new relative accuracy: 0.833333\n",
      "feature to remove: Purpose\n",
      "Removed feature: Purpose, new relative accuracy: 0.916667\n",
      "feature to remove: Credit_history\n",
      "Removed feature: Credit_history, new relative accuracy: 0.930556\n",
      "Accuracy on minimized data:  0.6416666666666667\n",
      "{'ranges': {'Duration_in_month': [7.0, 8.5, 11.0, 13.0, 14.0, 18.0, 23.0, 25.5, 34.5, 47.5]}, 'categories': {'debtors': [['A101', 'A102'], ['A103']]}, 'untouched': ['Existing_checking_account', 'Savings_account', 'Present_employment_since', 'Property', 'Housing', 'Purpose', 'Personal_status_sex', 'Present_residence', 'Credit_history', 'Telephone', 'Installment_rate', 'Other_installment_plans', 'Number_of_existing_credits', 'Credit_amount', 'N_people_being_liable_provide_maintenance', 'Foreign_worker', 'Age', 'Job']}\n"
     ]
    }
   ],
   "source": [
    "# We allow an 8% deviation in accuracy from the original model accuracy (target_accuracy=0.92)\n",
    "minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.92, features=features,\n",
    "                                     categorical_features=categorical_features, features_to_minimize=QI)\n",
    "\n",
    "minimizer2.fit(X_generalizer_train, x_train_predictions)\n",
    "transformed2 = minimizer2.transform(x_test)\n",
    "\n",
    "encoded_transformed2 = preprocessor.transform(transformed2)\n",
    "print('Accuracy on minimized data: ', model.score(encoded_transformed2, y_test))\n",
    "generalizations2 = minimizer2.generalizations\n",
    "print(generalizations2)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "This time we were able to generalize two features (Duration_in_month and debtors)."
   ],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
--- a/setup.cfg
+++ b/setup.cfg
@ -1,7 +1,7 @@
 [metadata]
 # replace with your username:
 name = ai-privacy-toolkit
-version = 0.0.3
+version = 0.0.4
 author = Abigail Goldsteen
 author_email = abigailt@il.ibm.com
 description = A toolkit for tools and techniques related to the privacy and compliance of AI models.
--- a/tests/test_anonymizer.py
+++ b/tests/test_anonymizer.py
@ -1,5 +1,8 @@
 import pytest
 import numpy as np
 from sklearn.compose import ColumnTransformer
 from sklearn.impute import SimpleImputer
 from sklearn.pipeline import Pipeline
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.preprocessing import OneHotEncoder
@ -19,7 +22,7 @@ def test_anonymize_ndarray_iris():
    k = 10
    QI = [0, 2]
-    anonymizer = Anonymize(k, QI)
+    anonymizer = Anonymize(k, QI, train_only_QI=True)
    anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
    assert(len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0)))
    _, counts_elements = np.unique(anon[:, QI], return_counts=True)
@ -29,18 +32,31 @@ def test_anonymize_ndarray_iris():
 def test_anonymize_pandas_adult():
    (x_train, y_train), _ = get_adult_dataset()
    encoded = OneHotEncoder().fit_transform(x_train)
    model = DecisionTreeClassifier()
    model.fit(encoded, y_train)
    pred = model.predict(encoded)
    k = 100
-    features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
+    features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation',
-                'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
+                'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
    QI = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
          'native-country']
    categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
                            'native-country']
    # prepare data for DT
    numeric_features = [f for f in features if f not in categorical_features]
    numeric_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
    )
    categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )
    encoded = preprocessor.fit_transform(x_train)
    model = DecisionTreeClassifier()
    model.fit(encoded, y_train)
    pred = model.predict(encoded)
    anonymizer = Anonymize(k, QI, categorical_features=categorical_features)
    anon = anonymizer.anonymize(ArrayDataset(x_train, pred, features))
@ -50,17 +66,30 @@ def test_anonymize_pandas_adult():
 def test_anonymize_pandas_nursery():
    (x_train, y_train), _ = get_nursery_dataset()
    features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health"]
    x_train = x_train.astype(str)
-    encoded = OneHotEncoder().fit_transform(x_train)
+
    k = 100
    features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health"]
    QI = ["finance", "social", "health"]
    categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health", 'children']
    # prepare data for DT
    numeric_features = [f for f in features if f not in categorical_features]
    numeric_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
    )
    categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )
    encoded = preprocessor.fit_transform(x_train)
    model = DecisionTreeClassifier()
    model.fit(encoded, y_train)
    pred = model.predict(encoded)
-    k = 100
+    anonymizer = Anonymize(k, QI, categorical_features=categorical_features, train_only_QI=True)
    QI = ["finance", "social", "health"]
    categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health", 'children']
    anonymizer = Anonymize(k, QI, categorical_features=categorical_features)
    anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
    assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
@ -78,7 +107,7 @@ def test_regression():
    pred = model.predict(x_train)
    k = 10
    QI = [0, 2, 5, 8]
-    anonymizer = Anonymize(k, QI, is_regression=True)
+    anonymizer = Anonymize(k, QI, is_regression=True, train_only_QI=True)
    anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
    print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
    model.fit(anon, y_train)
--- a/tests/test_minimizer.py
+++ b/tests/test_minimizer.py
@ -796,7 +796,7 @@ def test_BaseEstimator_regression():
    transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features))
    print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
    model.fit(transformed, y_train)
-    print('Base model accuracy (R2 score) after anonymization: ', model.score(x_test, y_test))
+    print('Base model accuracy (R2 score) after minimization: ', model.score(x_test, y_test))
    gener = gen.generalizations_
    expexted_generalizations = {'ranges': {
        'age': [-0.07816532626748085, -0.07090024650096893, -0.05637009255588055, -0.05092128552496433,