diff --git a/apt/__init__.py b/apt/__init__.py
index 99e5ad6..ea6178a 100644
--- a/apt/__init__.py
+++ b/apt/__init__.py
@@ -6,4 +6,4 @@ from apt import anonymization
 from apt import minimization
 from apt import utils
 
-__version__ = "0.0.3"
\ No newline at end of file
+__version__ = "0.0.4"
\ No newline at end of file
diff --git a/apt/anonymization/anonymizer.py b/apt/anonymization/anonymizer.py
index e1b4a7e..ca82c5c 100644
--- a/apt/anonymization/anonymizer.py
+++ b/apt/anonymization/anonymizer.py
@@ -3,6 +3,9 @@ import pandas as pd
 from scipy.spatial import distance
 from collections import Counter
+from sklearn.compose import ColumnTransformer
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.preprocessing import OneHotEncoder
 from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
 
@@ -15,28 +18,38 @@ class Anonymize:
     Class for performing tailored, model-guided anonymization of training datasets for ML models.
     Based on the implementation described in: https://arxiv.org/abs/2007.13086
+    Parameters
+    ----------
+    k : int
+        The privacy parameter that determines the number of records that will be indistinguishable from each
+        other (when looking at the quasi-identifiers). Should be at least 2.
+    quasi_identifiers : np.ndarray or list
+        The quasi-identifier features: feature names in the case of pandas data, or feature indexes
+        in the case of numpy data.
+    categorical_features : list, optional
+        The list of categorical features (should only be supplied when passing data as a
+        pandas dataframe).
+    is_regression : Bool, optional
+        Whether the model is a regression model or not (if False, assumes
+        a classification model). Default is False.
+    train_only_QI : Bool, optional
+        Whether to train the internal decision tree only on the quasi-identifier features. Default is
+        False (train the tree on all features).
     """
     def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
-                 is_regression=False):
-        """
-        :param k: The privacy parameter that determines the number of records that will be indistinguishable from each
-                  other (when looking at the quasi identifiers). Should be at least 2.
-        :param quasi_identifiers: The features that need to be minimized. It can be a list of feature names (strings) if
-                  dataset.feature_names is set, otherwise a list of indexes (integers).
-        :param categorical_features: The list of categorical features. It can be a list of feature names (strings) if
-                  dataset.feature_names is set, otherwise a list of indexes (integers).
-        :param is_regression: Boolean param indicates that is is a regression problem.
- """ + is_regression=False, train_only_QI=False): if k < 2: raise ValueError("k should be a positive integer with a value of 2 or higher") if quasi_identifiers is None or len(quasi_identifiers) < 1: raise ValueError("The list of quasi-identifiers cannot be empty") + self.k = k self.quasi_identifiers = quasi_identifiers self.categorical_features = categorical_features self.is_regression = is_regression self.features_names = None + self.train_only_QI = train_only_QI def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE: """ @@ -72,7 +85,10 @@ class Anonymize: def _anonymize(self, x, y): if x.shape[0] != y.shape[0]: raise ValueError("x and y should have same number of rows") - x_anonymizer_train = x[:, self.quasi_identifiers] + x_anonymizer_train = x + if self.train_only_QI: + # build DT just on QI features + x_anonymizer_train = x[:, self.quasi_identifiers] if x.dtype.kind not in 'iufc': if not self.categorical_features: raise ValueError('when supplying an array with non-numeric data, categorical_features must be defined') @@ -151,6 +167,21 @@ class Anonymize: return x def _modify_categorical_features(self, x): - encoder = OneHotEncoder() - one_hot_encoded = encoder.fit_transform(x) - return one_hot_encoded + # prepare data for DT + used_features = self.features + if self.train_only_QI: + used_features = self.quasi_identifiers + numeric_features = [f for f in x.columns if f in used_features and f not in self.categorical_features] + categorical_features = [f for f in self.categorical_features if f in used_features] + numeric_transformer = Pipeline( + steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))] + ) + categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False) + preprocessor = ColumnTransformer( + transformers=[ + ("num", numeric_transformer, numeric_features), + ("cat", categorical_transformer, categorical_features), + ] + ) + encoded = preprocessor.fit_transform(x) + return encoded diff --git a/apt/minimization/__init__.py b/apt/minimization/__init__.py index e9aa35d..10d0a57 100644 --- a/apt/minimization/__init__.py +++ b/apt/minimization/__init__.py @@ -12,8 +12,5 @@ them to new data. It is also possible to export the generalizations as feature ranges. -The current implementation supports only numeric features, so any categorical features must be transformed to a numeric -representation before using this class. - """ from apt.minimization.minimizer import GeneralizeToRepresentative diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py index 13dc619..27b6b6e 100644 --- a/apt/minimization/minimizer.py +++ b/apt/minimization/minimizer.py @@ -22,6 +22,7 @@ from apt.utils.models import Model, SklearnRegressor, ModelOutputType, SklearnCl class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin): """ A transformer that generalizes data to representative points. + Learns data generalizations based on an original model's predictions and a target accuracy. Once the generalizations are learned, can receive one or more data records and transform them to representative @@ -58,6 +59,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM The required method to train data set for minimizing. Default is to train the tree just on the features that are given as features_to_minimize. + is_regression : Bool, optional + Whether the model is a regression model or not (if False, assumes + a classification model). Default is False. 
+ Attributes ---------- features_ : list of str @@ -69,8 +74,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM as measured on the training data. generalizations_ : object The generalizations that were learned (actual feature ranges). - Notes - ----- """ def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: float = 0.998, @@ -95,11 +98,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM def get_params(self, deep=True): """Get parameters for this estimator. + Parameters ---------- deep : boolean, optional If True, will return the parameters for this estimator and contained subobjects that are estimators. + Returns ------- params : mapping of string to any @@ -116,6 +121,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM def set_params(self, **params): """Set the parameters of this estimator. + Returns ------- self : object @@ -134,6 +140,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional = None, dataset: Optional[ArrayDataset] = None): """Learns the generalizations based on training data, and applies them to the data. + Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features), optional @@ -158,6 +165,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM def fit(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional = None, dataset: ArrayDataset = None): """Learns the generalizations based on training data. + Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features), optional @@ -380,6 +388,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional = None, dataset: ArrayDataset = None): """ Transforms data records to representative points. + Parameters ---------- X : {array-like, sparse-matrix}, shape (n_samples, n_features), If provided as a pandas dataframe, diff --git a/apt/utils/dataset_utils.py b/apt/utils/dataset_utils.py index 2405f8f..e3eb959 100644 --- a/apt/utils/dataset_utils.py +++ b/apt/utils/dataset_utils.py @@ -18,14 +18,14 @@ def _load_iris(test_set_size: float = 0.3): return (x_train, y_train), (x_test, y_test) -def get_iris_dataset(): +def get_iris_dataset(test_set: float = 0.3): """ Loads the Iris dataset from scikit-learn. :param test_set: Proportion of the data to use as validation split (value between 0 and 1). :return: Entire dataset and labels as numpy array. """ - return _load_iris() + return _load_iris(test_set) def _load_diabetes(test_set_size: float = 0.3): @@ -54,6 +54,7 @@ def get_german_credit_dataset(test_set: float = 0.3): """ Loads the UCI German_credit dataset from `tests/datasets/german` or downloads it if necessary. + :param test_set: Proportion of the data to use as validation split (value between 0 and 1). :return: Dataset and labels as pandas dataframes. 
""" diff --git a/docs/conf.py b/docs/conf.py index 0b26b58..36cdd76 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,7 +22,7 @@ copyright = '2021, IBM' author = 'Abigail Goldsteen' # The full version, including alpha/beta/rc tags -release = '0.0.3' +release = '0.0.4' master_doc = 'index' diff --git a/docs/source/tests.rst b/docs/source/tests.rst deleted file mode 100644 index b1428e0..0000000 --- a/docs/source/tests.rst +++ /dev/null @@ -1,30 +0,0 @@ -tests package -============= - -Submodules ----------- - -tests.test\_anonymizer module ------------------------------ - -.. automodule:: tests.test_anonymizer - :members: - :undoc-members: - :show-inheritance: - -tests.test\_minimizer module ----------------------------- - -.. automodule:: tests.test_minimizer - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: tests - :members: - :undoc-members: - :show-inheritance: diff --git a/notebooks/minimization_diabetes_reg.ipynb b/notebooks/minimization_diabetes_reg.ipynb new file mode 100644 index 0000000..597d77a --- /dev/null +++ b/notebooks/minimization_diabetes_reg.ipynb @@ -0,0 +1,262 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Applying data minimization to a trained regression ML model" + ] + }, + { + "cell_type": "markdown", + "source": [ + "In this tutorial we will show how to perform data minimization for regression ML models using the minimization module.\n", + "\n", + "We will show you applying data minimization to a different trained regression models." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Load data\n", + "QI parameter determines which features will be minimized." + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 54, + "outputs": [], + "source": [ + "from sklearn.datasets import load_diabetes\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "dataset = load_diabetes()\n", + "X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)\n", + "\n", + "features = ['age', 'sex', 'bmi', 'bp',\n", + " 's1', 's2', 's3', 's4', 's5', 's6']\n", + "QI = [0, 2, 5, 8, 9]" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Train DecisionTreeRegressor model" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 55, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Base model accuracy (R2 score): 0.15014421352446072\n" + ] + } + ], + "source": [ + "from apt.minimization import GeneralizeToRepresentative\n", + "from sklearn.tree import DecisionTreeRegressor\n", + "\n", + "model1 = DecisionTreeRegressor(random_state=10, min_samples_split=2)\n", + "model1.fit(X_train, y_train)\n", + "print('Base model accuracy (R2 score): ', model1.score(X_test, y_test))" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Run minimization\n", + "We will try to run minimization with only a subset of the features." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 56, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.108922\n", + "Improving accuracy\n", + "feature to remove: s5\n", + "Removed feature: s5, new relative accuracy: 0.505498\n", + "feature to remove: s6\n", + "Removed feature: s6, new relative accuracy: 0.404757\n", + "feature to remove: bmi\n", + "Removed feature: bmi, new relative accuracy: 0.718978\n", + "Accuracy on minimized data: 0.11604533946025941\n", + "generalizations: {'ranges': {'age': [-0.07090024650096893, -0.043656209483742714, -0.041839939542114735, -0.03639113181270659, -0.01459590089507401, -0.012779632292222232, -0.009147093165665865, -0.0036982858437113464, 0.03989217430353165, 0.039892176166176796, 0.05623859912157059, 0.06713621318340302], 's2': [-0.0550188384950161, -0.0285577941685915, -0.024643437936902046, -0.02135537937283516, -0.013683241792023182, -0.006480826530605555, 0.009176596067845821, 0.023111702874302864, 0.02420772146433592, 0.02655633445829153, 0.039082273840904236]}, 'categories': {}, 'untouched': ['s3', 'bmi', 's6', 'bp', 's4', 's5', 'sex', 's1']}\n" + ] + } + ], + "source": [ + "# note that is_regression param is True\n", + "\n", + "minimizer1 = GeneralizeToRepresentative(model1, target_accuracy=0.7, features=features, is_regression=True,\n", + " features_to_minimize=QI)\n", + "\n", + "# Fitting the minimizar can be done either on training or test data. Doing it with test data is better as the\n", + "# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n", + "# data it could result in a larger gap)\n", + "# Don't forget to leave a hold-out set for final validation!\n", + "X_generalizer_train1, x_test1, y_generalizer_train1, y_test1 = train_test_split(X_test, y_test,\n", + " test_size = 0.4, random_state = 38)\n", + "\n", + "x_train_predictions1 = model1.predict(X_generalizer_train1)\n", + "minimizer1.fit(X_generalizer_train1, x_train_predictions1)\n", + "transformed1 = minimizer1.transform(x_test1)\n", + "print('Accuracy on minimized data: ', model1.score(transformed1, y_test1))\n", + "print('generalizations: ',minimizer1.generalizations_)#%% md" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Train linear regression model" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "from apt.minimization import GeneralizeToRepresentative\n", + "\n", + "model2 = LinearRegression()\n", + "model2.fit(X_train, y_train)\n", + "print('Base model accuracy (R2 score): ', model2.score(X_test, y_test))" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Run minimization\n", + "We will try to run minimization with only a subset of the features." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 58, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.225782\n", + "Improving accuracy\n", + "feature to remove: age\n", + "Removed feature: age, new relative accuracy: 0.223565\n", + "feature to remove: s2\n", + "Removed feature: s2, new relative accuracy: 0.759788\n", + "Accuracy on minimized data: 0.4414329261774286\n", + "generalizations: {'ranges': {'bmi': [-0.0660245232284069, -0.06171327643096447, -0.048779530450701714, -0.036923596635460854, -0.022912041284143925, -0.015906263142824173, -0.009978296235203743, 0.007266696775332093, 0.022356065921485424, 0.028822937980294228, 0.04499012045562267, 0.053073709830641747, 0.10103634744882584], 's5': [-0.08940735459327698, -0.07823517918586731, -0.07310866191983223, -0.07022909820079803, -0.06740894541144371, -0.06558558344841003, -0.041897499933838844, -0.04049498960375786, -0.03781316243112087, -0.033939776942133904, -0.03263746201992035, -0.02538660168647766, -0.023219254799187183, -0.017585186287760735, -0.016525186598300934, -0.008522996446117759, 0.0015758189256303012, 0.012934560421854258, 0.014069339726120234, 0.015929921995848417, 0.01947084255516529, 0.028651678003370762, 0.03358383011072874, 0.03639278281480074, 0.041416410356760025, 0.06386702693998814], 's6': [-0.07356456853449345, -0.052854035049676895, -0.048711927607655525, -0.0383566590026021, -0.02800139266764745, -0.021788232028484344, -0.007290858076885343, -0.007290857844054699, 0.017561784014105797, 0.02377494378015399, 0.02791705122217536, 0.02998810407007113, 0.054840744473040104]}, 'categories': {}, 'untouched': ['s2', 's3', 'bp', 's4', 'age', 'sex', 's1']}\n" + ] + } + ], + "source": [ + "# note that is_regression param is True\n", + "\n", + "minimizer2 = GeneralizeToRepresentative(model2, target_accuracy=0.7, features=features, is_regression=True,\n", + " features_to_minimize=QI)\n", + "\n", + "# Fitting the minimizar can be done either on training or test data. 
Doing it with test data is better as the\n", + "# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n", + "# data it could result in a larger gap)\n", + "# Don't forget to leave a hold-out set for final validation!\n", + "X_generalizer_train2, x_test2, y_generalizer_train2, y_test2 = train_test_split(X_test, y_test,\n", + " test_size = 0.4, random_state = 38)\n", + "\n", + "x_train_predictions2 = model2.predict(X_generalizer_train2)\n", + "minimizer2.fit(X_generalizer_train2, x_train_predictions2)\n", + "transformed2 = minimizer2.transform(x_test2)\n", + "print('Accuracy on minimized data: ', model2.score(transformed2, y_test2))\n", + "print('generalizations: ',minimizer2.generalizations_)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/notebooks/minimization_german_credit.ipynb b/notebooks/minimization_german_credit.ipynb new file mode 100644 index 0000000..03af5f0 --- /dev/null +++ b/notebooks/minimization_german_credit.ipynb @@ -0,0 +1,385 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Applying data minimization with categorical data and only a subset of the features to a trained ML model" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "In this tutorial we will show how to perform data minimization for ML models using the minimization module.\n", + "\n", + "This will be demonstarted using the German Credit dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data)." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Load data\n", + "QI parameter determines which features will be minimized." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 1, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Existing_checking_account Duration_in_month Credit_history Purpose \\\n", + "0 A14 24 A32 A41 \n", + "1 A14 33 A33 A49 \n", + "2 A11 9 A32 A42 \n", + "3 A14 28 A34 A43 \n", + "4 A11 24 A33 A43 \n", + ".. ... ... ... ... \n", + "695 A14 12 A32 A43 \n", + "696 A14 13 A32 A43 \n", + "697 A11 48 A30 A41 \n", + "698 A12 21 A34 A42 \n", + "699 A13 15 A32 A46 \n", + "\n", + " Credit_amount Savings_account Present_employment_since Installment_rate \\\n", + "0 7814 A61 A74 3 \n", + "1 2764 A61 A73 2 \n", + "2 2136 A61 A73 3 \n", + "3 2743 A61 A75 4 \n", + "4 1659 A61 A72 4 \n", + ".. ... ... ... ... \n", + "695 1963 A61 A74 4 \n", + "696 1409 A62 A71 2 \n", + "697 4605 A61 A75 3 \n", + "698 2745 A64 A74 3 \n", + "699 1905 A61 A75 4 \n", + "\n", + " Personal_status_sex debtors Present_residence Property Age \\\n", + "0 A93 A101 3 A123 38 \n", + "1 A92 A101 2 A123 26 \n", + "2 A93 A101 2 A121 25 \n", + "3 A93 A101 2 A123 29 \n", + "4 A92 A101 2 A123 29 \n", + ".. ... ... ... ... ... 
\n", + "695 A93 A101 2 A123 31 \n", + "696 A92 A101 4 A121 64 \n", + "697 A93 A101 4 A124 24 \n", + "698 A93 A101 2 A123 32 \n", + "699 A93 A101 4 A123 40 \n", + "\n", + " Other_installment_plans Housing Number_of_existing_credits Job \\\n", + "0 A143 A152 1 A174 \n", + "1 A143 A152 2 A173 \n", + "2 A143 A152 1 A173 \n", + "3 A143 A152 2 A173 \n", + "4 A143 A151 1 A172 \n", + ".. ... ... ... ... \n", + "695 A143 A151 2 A174 \n", + "696 A143 A152 1 A173 \n", + "697 A143 A153 2 A173 \n", + "698 A143 A152 2 A173 \n", + "699 A143 A151 1 A174 \n", + "\n", + " N_people_being_liable_provide_maintenance Telephone Foreign_worker \n", + "0 1 1 1 \n", + "1 1 1 1 \n", + "2 1 0 1 \n", + "3 1 0 1 \n", + "4 1 1 1 \n", + ".. ... ... ... \n", + "695 2 1 1 \n", + "696 1 0 1 \n", + "697 2 0 1 \n", + "698 1 1 1 \n", + "699 1 1 1 \n", + "\n", + "[700 rows x 20 columns]\n" + ] + } + ], + "source": [ + "from apt.utils import get_german_credit_dataset\n", + "\n", + "(x_train, y_train), (x_test, y_test) = get_german_credit_dataset()\n", + "features = [\"Existing_checking_account\", \"Duration_in_month\", \"Credit_history\", \"Purpose\", \"Credit_amount\",\n", + " \"Savings_account\", \"Present_employment_since\", \"Installment_rate\", \"Personal_status_sex\", \"debtors\",\n", + " \"Present_residence\", \"Property\", \"Age\", \"Other_installment_plans\", \"Housing\",\n", + " \"Number_of_existing_credits\", \"Job\", \"N_people_being_liable_provide_maintenance\", \"Telephone\",\n", + " \"Foreign_worker\"]\n", + "categorical_features = [\"Existing_checking_account\", \"Credit_history\", \"Purpose\", \"Savings_account\",\n", + " \"Present_employment_since\", \"Personal_status_sex\", \"debtors\", \"Property\",\n", + " \"Other_installment_plans\", \"Housing\", \"Job\"]\n", + "QI = [\"Duration_in_month\", \"Credit_history\", \"Purpose\", \"debtors\", \"Property\", \"Other_installment_plans\",\n", + " \"Housing\", \"Job\"]\n", + "\n", + "print(x_train)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Train decision tree model\n", + "we use OneHotEncoder to handle categorical features." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Base model accuracy: 0.7033333333333334\n" + ] + } + ], + "source": [ + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "numeric_features = [f for f in features if f not in categorical_features]\n", + "numeric_transformer = Pipeline(\n", + " steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]\n", + ")\n", + "categorical_transformer = OneHotEncoder(handle_unknown=\"ignore\", sparse=False)\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " (\"num\", numeric_transformer, numeric_features),\n", + " (\"cat\", categorical_transformer, categorical_features),\n", + " ]\n", + ")\n", + "encoded_train = preprocessor.fit_transform(x_train)\n", + "model = DecisionTreeClassifier()\n", + "model.fit(encoded_train, y_train)\n", + "\n", + "encoded_test = preprocessor.transform(x_test)\n", + "print('Base model accuracy: ', model.score(encoded_test, y_test))" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Run minimization\n", + "We will try to run minimization with categorical features and only a subset of the features with different possible values of target accuracy (how close to the original model's accuracy we want to get, 1 being same accuracy as for original data)." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.791667\n", + "Improving accuracy\n", + "feature to remove: Property\n", + "Removed feature: Property, new relative accuracy: 0.819444\n", + "feature to remove: Other_installment_plans\n", + "Removed feature: Other_installment_plans, new relative accuracy: 0.833333\n", + "feature to remove: Job\n", + "Removed feature: Job, new relative accuracy: 0.833333\n", + "feature to remove: Housing\n", + "Removed feature: Housing, new relative accuracy: 0.833333\n", + "feature to remove: Purpose\n", + "Removed feature: Purpose, new relative accuracy: 0.916667\n", + "feature to remove: Credit_history\n", + "Removed feature: Credit_history, new relative accuracy: 0.930556\n", + "feature to remove: debtors\n", + "Removed feature: debtors, new relative accuracy: 0.944444\n", + "feature to remove: Duration_in_month\n", + "Removed feature: Duration_in_month, new relative accuracy: 1.000000\n", + "Accuracy on minimized data: 0.6666666666666666\n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "sys.path.insert(0, os.path.abspath('..'))\n", + "\n", + "from apt.minimization import GeneralizeToRepresentative\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# default target_accuracy is 0.998\n", + "minimizer = GeneralizeToRepresentative(model, features=features,\n", + " categorical_features=categorical_features, features_to_minimize=QI)\n", + "\n", + "# Fitting the minimizar can be done either on training or test data. 
Doing it with test data is better as the\n", + "# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n", + "# data it could result in a larger gap)\n", + "# Don't forget to leave a hold-out set for final validation!\n", + "X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(x_test, y_test, stratify=y_test,\n", + " test_size = 0.4, random_state = 38)\n", + "X_generalizer_train.reset_index(drop=True, inplace=True)\n", + "y_generalizer_train.reset_index(drop=True, inplace=True)\n", + "x_test.reset_index(drop=True, inplace=True)\n", + "y_test.reset_index(drop=True, inplace=True)\n", + "encoded_generalizer_train = preprocessor.transform(X_generalizer_train)\n", + "x_train_predictions = model.predict(encoded_generalizer_train)\n", + "minimizer.fit(X_generalizer_train, x_train_predictions)\n", + "transformed = minimizer.transform(x_test)\n", + "\n", + "encoded_transformed = preprocessor.transform(transformed)\n", + "print('Accuracy on minimized data: ', model.score(encoded_transformed, y_test))" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Let's see what features were generalized" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'ranges': {}, 'categories': {}, 'untouched': ['Purpose', 'Present_residence', 'Credit_history', 'Telephone', 'Job', 'Housing', 'Installment_rate', 'Number_of_existing_credits', 'Foreign_worker', 'Existing_checking_account', 'Other_installment_plans', 'N_people_being_liable_provide_maintenance', 'Property', 'Savings_account', 'Present_employment_since', 'Personal_status_sex', 'Duration_in_month', 'debtors', 'Credit_amount', 'Age']}\n" + ] + } + ], + "source": [ + "generalizations = minimizer.generalizations\n", + "print(generalizations)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "We can see that for the default target accuracy of 0.998 of the original accuracy, no generalizations are possible (all features are left untouched, i.e., not generalized).\n", + "\n", + "Let's change to a slightly lower target accuracy." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.791667\n", + "Improving accuracy\n", + "feature to remove: Property\n", + "Removed feature: Property, new relative accuracy: 0.819444\n", + "feature to remove: Other_installment_plans\n", + "Removed feature: Other_installment_plans, new relative accuracy: 0.833333\n", + "feature to remove: Job\n", + "Removed feature: Job, new relative accuracy: 0.833333\n", + "feature to remove: Housing\n", + "Removed feature: Housing, new relative accuracy: 0.833333\n", + "feature to remove: Purpose\n", + "Removed feature: Purpose, new relative accuracy: 0.916667\n", + "feature to remove: Credit_history\n", + "Removed feature: Credit_history, new relative accuracy: 0.930556\n", + "Accuracy on minimized data: 0.6416666666666667\n", + "{'ranges': {'Duration_in_month': [7.0, 8.5, 11.0, 13.0, 14.0, 18.0, 23.0, 25.5, 34.5, 47.5]}, 'categories': {'debtors': [['A101', 'A102'], ['A103']]}, 'untouched': ['Existing_checking_account', 'Savings_account', 'Present_employment_since', 'Property', 'Housing', 'Purpose', 'Personal_status_sex', 'Present_residence', 'Credit_history', 'Telephone', 'Installment_rate', 'Other_installment_plans', 'Number_of_existing_credits', 'Credit_amount', 'N_people_being_liable_provide_maintenance', 'Foreign_worker', 'Age', 'Job']}\n" + ] + } + ], + "source": [ + "# We allow a 1% deviation in accuracy from the original model accuracy\n", + "minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.92, features=features,\n", + " categorical_features=categorical_features, features_to_minimize=QI)\n", + "\n", + "minimizer2.fit(X_generalizer_train, x_train_predictions)\n", + "transformed2 = minimizer2.transform(x_test)\n", + "\n", + "encoded_transformed2 = preprocessor.transform(transformed2)\n", + "print('Accuracy on minimized data: ', model.score(encoded_transformed2, y_test))\n", + "generalizations2 = minimizer2.generalizations\n", + "print(generalizations2)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "This time we were able to generalize two features (Duration_in_month and debtors)." + ], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index f82cdb6..2e79a5f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,7 @@ [metadata] # replace with your username: name = ai-privacy-toolkit -version = 0.0.3 +version = 0.0.4 author = Abigail Goldsteen author_email = abigailt@il.ibm.com description = A toolkit for tools and techniques related to the privacy and compliance of AI models. 
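Aside (not part of the patch): the numeric-imputation plus one-hot-encoding pattern recurs in three places in this change set -- inside Anonymize._modify_categorical_features, in the German-credit notebook above, and in the updated tests that follow. A minimal standalone sketch of that pattern is shown below; the column names ("age", "housing") are hypothetical placeholders, and sparse=False matches the scikit-learn API used throughout this patch (newer scikit-learn releases rename it to sparse_output).

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# hypothetical toy frame with one numeric and one categorical column
df = pd.DataFrame({"age": [25.0, 40.0, None], "housing": ["A151", "A152", "A152"]})
numeric_features = ["age"]
categorical_features = ["housing"]

# numeric columns: missing values replaced with a constant (0)
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="constant", fill_value=0))])
# categorical columns: one-hot encoded; categories unseen at fit time are ignored at transform time
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# dense array: imputed numeric columns first, then the one-hot columns
encoded = preprocessor.fit_transform(df)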
diff --git a/tests/test_anonymizer.py b/tests/test_anonymizer.py index 464bd20..0547a86 100644 --- a/tests/test_anonymizer.py +++ b/tests/test_anonymizer.py @@ -1,5 +1,8 @@ import pytest import numpy as np +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.preprocessing import OneHotEncoder @@ -19,7 +22,7 @@ def test_anonymize_ndarray_iris(): k = 10 QI = [0, 2] - anonymizer = Anonymize(k, QI) + anonymizer = Anonymize(k, QI, train_only_QI=True) anon = anonymizer.anonymize(ArrayDataset(x_train, pred)) assert(len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0))) _, counts_elements = np.unique(anon[:, QI], return_counts=True) @@ -29,18 +32,31 @@ def test_anonymize_ndarray_iris(): def test_anonymize_pandas_adult(): (x_train, y_train), _ = get_adult_dataset() - encoded = OneHotEncoder().fit_transform(x_train) - model = DecisionTreeClassifier() - model.fit(encoded, y_train) - pred = model.predict(encoded) k = 100 - features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', - 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'] + features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', + 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'] QI = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'] categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'] + # prepare data for DT + numeric_features = [f for f in features if f not in categorical_features] + numeric_transformer = Pipeline( + steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))] + ) + categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False) + preprocessor = ColumnTransformer( + transformers=[ + ("num", numeric_transformer, numeric_features), + ("cat", categorical_transformer, categorical_features), + ] + ) + encoded = preprocessor.fit_transform(x_train) + model = DecisionTreeClassifier() + model.fit(encoded, y_train) + pred = model.predict(encoded) + anonymizer = Anonymize(k, QI, categorical_features=categorical_features) anon = anonymizer.anonymize(ArrayDataset(x_train, pred, features)) @@ -50,17 +66,30 @@ def test_anonymize_pandas_adult(): def test_anonymize_pandas_nursery(): (x_train, y_train), _ = get_nursery_dataset() - features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health"] x_train = x_train.astype(str) - encoded = OneHotEncoder().fit_transform(x_train) + + k = 100 + features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health"] + QI = ["finance", "social", "health"] + categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health", 'children'] + # prepare data for DT + numeric_features = [f for f in features if f not in categorical_features] + numeric_transformer = Pipeline( + steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))] + ) + categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False) + preprocessor = ColumnTransformer( + transformers=[ + ("num", numeric_transformer, numeric_features), + ("cat", categorical_transformer, categorical_features), + ] + ) + encoded = preprocessor.fit_transform(x_train) model 
= DecisionTreeClassifier() model.fit(encoded, y_train) pred = model.predict(encoded) - k = 100 - QI = ["finance", "social", "health"] - categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health", 'children'] - anonymizer = Anonymize(k, QI, categorical_features=categorical_features) + anonymizer = Anonymize(k, QI, categorical_features=categorical_features, train_only_QI=True) anon = anonymizer.anonymize(ArrayDataset(x_train, pred)) assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0]) @@ -78,7 +107,7 @@ def test_regression(): pred = model.predict(x_train) k = 10 QI = [0, 2, 5, 8] - anonymizer = Anonymize(k, QI, is_regression=True) + anonymizer = Anonymize(k, QI, is_regression=True, train_only_QI=True) anon = anonymizer.anonymize(ArrayDataset(x_train, pred)) print('Base model accuracy (R2 score): ', model.score(x_test, y_test)) model.fit(anon, y_train) diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py index 0d6742d..630cd49 100644 --- a/tests/test_minimizer.py +++ b/tests/test_minimizer.py @@ -796,7 +796,7 @@ def test_BaseEstimator_regression(): transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features)) print('Base model accuracy (R2 score): ', model.score(x_test, y_test)) model.fit(transformed, y_train) - print('Base model accuracy (R2 score) after anonymization: ', model.score(x_test, y_test)) + print('Base model accuracy (R2 score) after minimization: ', model.score(x_test, y_test)) gener = gen.generalizations_ expexted_generalizations = {'ranges': { 'age': [-0.07816532626748085, -0.07090024650096893, -0.05637009255588055, -0.05092128552496433,
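Aside (not part of the patch): a minimal end-to-end sketch of the new train_only_QI flag, mirroring the updated Iris test above. Import paths follow the module locations shown in this diff.

import numpy as np
from sklearn.tree import DecisionTreeClassifier

from apt.anonymization.anonymizer import Anonymize
from apt.utils.datasets import ArrayDataset
from apt.utils.dataset_utils import get_iris_dataset

(x_train, y_train), _ = get_iris_dataset()

# original model whose predictions guide the anonymization
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_train)

k = 10
QI = [0, 2]  # quasi-identifier feature indexes (numpy input)

# train_only_QI=True builds the internal decision tree only on the QI columns
# (the behaviour of previous releases); the new default, False, trains it on all features
anonymizer = Anonymize(k, QI, train_only_QI=True)
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))

# the anonymized data has fewer distinct QI combinations than the original
assert len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0))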