From 9de078f9375529d9717d32838c6552f7fd329f6c Mon Sep 17 00:00:00 2001 From: abigailgold <57357634+abigailgold@users.noreply.github.com> Date: Tue, 1 Feb 2022 12:27:22 +0200 Subject: [PATCH 1/2] Update readme's with paper citations (#21) --- apt/anonymization/README.md | 7 +++++++ apt/minimization/README.md | 28 ++++++++++++++++------------ 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/apt/anonymization/README.md b/apt/anonymization/README.md index 3ca161c..fb27fed 100644 --- a/apt/anonymization/README.md +++ b/apt/anonymization/README.md @@ -19,4 +19,11 @@ The following figure depicts the overall process:


+Citation +-------- +Goldsteen A., Ezov G., Shmelkin R., Moffie M., Farkash A. (2022) Anonymizing Machine Learning Models. In: Garcia-Alfaro +J., Muñoz-Tapia J.L., Navarro-Arribas G., Soriano M. (eds) Data Privacy Management, Cryptocurrencies and Blockchain +Technology. DPM 2021, CBT 2021. Lecture Notes in Computer Science, vol 13140. Springer, Cham. +https://doi.org/10.1007/978-3-030-93944-1_8 + diff --git a/apt/minimization/README.md b/apt/minimization/README.md index 0f19ede..ff302b5 100644 --- a/apt/minimization/README.md +++ b/apt/minimization/README.md @@ -37,8 +37,7 @@ The current implementation supports numeric features and categorical features. Start by training your machine learning model. In this example, we will use a ``DecisionTreeClassifier``, but any scikit-learn model can be used. We will use the iris dataset in our example. -.. code:: python - +``` from sklearn import datasets from sklearn.model_selection import train_test_split from sklearn.tree import DecisionTreeClassifier @@ -48,36 +47,37 @@ scikit-learn model can be used. We will use the iris dataset in our example. base_est = DecisionTreeClassifier() base_est.fit(X_train, y_train) +``` Now create the ``GeneralizeToRepresentative`` transformer and train it. Supply it with the original model and the desired target accuracy. The training process may receive the original labeled training data or the model's predictions on the data. -.. code:: python - +``` predictions = base_est.predict(X_train) gen = GeneralizeToRepresentative(base_est, target_accuracy=0.9) gen.fit(X_train, predictions) +``` Now use the transformer to transform new data, for example the test data. -.. code:: python - +``` transformed = gen.transform(X_test) +``` The transformed data has the same columns and formats as the original data, so it can be used directly to derive predictions from the original model. -.. 
code:: python - +``` new_predictions = base_est.predict(transformed) - +``` + To export the resulting generalizations, retrieve the ``Transformer``'s ``_generalize`` parameter. -.. code:: python - +``` generalizations = base_est._generalize - +``` + The returned object has the following structure:: { @@ -103,6 +103,10 @@ Where each value inside the range list represents a cutoff point. For example, f this example are: ``<21.5, 21.5-39.0, 39.0-51.0, 51.0-70.5, >70.5``. The ``untouched`` list represents features that were not generalized, i.e., their values should remain unchanged. +Citation +-------- +Goldsteen, A., Ezov, G., Shmelkin, R. et al. Data minimization for GDPR compliance in machine learning models. AI Ethics +(2021). https://doi.org/10.1007/s43681-021-00095-8 From 752871dd0cff14aecb1266848bd81f01cde38975 Mon Sep 17 00:00:00 2001 From: olasaadi <92303887+olasaadi@users.noreply.github.com> Date: Wed, 23 Feb 2022 14:57:12 +0200 Subject: [PATCH 2/2] add minimization notebook (#22) * add german credit notebook to showcase new features (minimize only some features and categorical features) * add notebook to show minimization data on a regression problem --- notebooks/minimization_diabetes_reg.ipynb | 262 ++++++++++++++ notebooks/minimization_german_credit.ipynb | 385 +++++++++++++++++++++ tests/test_minimizer.py | 2 +- 3 files changed, 648 insertions(+), 1 deletion(-) create mode 100644 notebooks/minimization_diabetes_reg.ipynb create mode 100644 notebooks/minimization_german_credit.ipynb diff --git a/notebooks/minimization_diabetes_reg.ipynb b/notebooks/minimization_diabetes_reg.ipynb new file mode 100644 index 0000000..597d77a --- /dev/null +++ b/notebooks/minimization_diabetes_reg.ipynb @@ -0,0 +1,262 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Applying data minimization to a trained regression ML model" + ] + }, + { + "cell_type": "markdown", + 
"source": [ + "In this tutorial we will show how to perform data minimization for regression ML models using the minimization module.\n", + "\n", + "We will show how to apply data minimization to different trained regression models." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Load data\n", + "QI parameter determines which features will be minimized." + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 54, + "outputs": [], + "source": [ + "from sklearn.datasets import load_diabetes\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "dataset = load_diabetes()\n", + "X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)\n", + "\n", + "features = ['age', 'sex', 'bmi', 'bp',\n", + " 's1', 's2', 's3', 's4', 's5', 's6']\n", + "QI = [0, 2, 5, 8, 9]" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Train DecisionTreeRegressor model" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 55, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Base model accuracy (R2 score): 0.15014421352446072\n" + ] + } + ], + "source": [ + "from apt.minimization import GeneralizeToRepresentative\n", + "from sklearn.tree import DecisionTreeRegressor\n", + "\n", + "model1 = DecisionTreeRegressor(random_state=10, min_samples_split=2)\n", + "model1.fit(X_train, y_train)\n", + "print('Base model accuracy (R2 score): ', model1.score(X_test, y_test))" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Run minimization\n", + "We will try to run minimization with only a subset of the features." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 56, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.108922\n", + "Improving accuracy\n", + "feature to remove: s5\n", + "Removed feature: s5, new relative accuracy: 0.505498\n", + "feature to remove: s6\n", + "Removed feature: s6, new relative accuracy: 0.404757\n", + "feature to remove: bmi\n", + "Removed feature: bmi, new relative accuracy: 0.718978\n", + "Accuracy on minimized data: 0.11604533946025941\n", + "generalizations: {'ranges': {'age': [-0.07090024650096893, -0.043656209483742714, -0.041839939542114735, -0.03639113181270659, -0.01459590089507401, -0.012779632292222232, -0.009147093165665865, -0.0036982858437113464, 0.03989217430353165, 0.039892176166176796, 0.05623859912157059, 0.06713621318340302], 's2': [-0.0550188384950161, -0.0285577941685915, -0.024643437936902046, -0.02135537937283516, -0.013683241792023182, -0.006480826530605555, 0.009176596067845821, 0.023111702874302864, 0.02420772146433592, 0.02655633445829153, 0.039082273840904236]}, 'categories': {}, 'untouched': ['s3', 'bmi', 's6', 'bp', 's4', 's5', 'sex', 's1']}\n" + ] + } + ], + "source": [ + "# note that is_regression param is True\n", + "\n", + "minimizer1 = GeneralizeToRepresentative(model1, target_accuracy=0.7, features=features, is_regression=True,\n", + " features_to_minimize=QI)\n", + "\n", + "# Fitting the minimizer can be done either on training or test data. 
Doing it with test data is better as the\n", + "# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n", + "# data it could result in a larger gap)\n", + "# Don't forget to leave a hold-out set for final validation!\n", + "X_generalizer_train1, x_test1, y_generalizer_train1, y_test1 = train_test_split(X_test, y_test,\n", + " test_size = 0.4, random_state = 38)\n", + "\n", + "x_train_predictions1 = model1.predict(X_generalizer_train1)\n", + "minimizer1.fit(X_generalizer_train1, x_train_predictions1)\n", + "transformed1 = minimizer1.transform(x_test1)\n", + "print('Accuracy on minimized data: ', model1.score(transformed1, y_test1))\n", + "print('generalizations: ',minimizer1.generalizations_)#%% md" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Train linear regression model" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "from apt.minimization import GeneralizeToRepresentative\n", + "\n", + "model2 = LinearRegression()\n", + "model2.fit(X_train, y_train)\n", + "print('Base model accuracy (R2 score): ', model2.score(X_test, y_test))" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Run minimization\n", + "We will try to run minimization with only a subset of the features." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 58, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.225782\n", + "Improving accuracy\n", + "feature to remove: age\n", + "Removed feature: age, new relative accuracy: 0.223565\n", + "feature to remove: s2\n", + "Removed feature: s2, new relative accuracy: 0.759788\n", + "Accuracy on minimized data: 0.4414329261774286\n", + "generalizations: {'ranges': {'bmi': [-0.0660245232284069, -0.06171327643096447, -0.048779530450701714, -0.036923596635460854, -0.022912041284143925, -0.015906263142824173, -0.009978296235203743, 0.007266696775332093, 0.022356065921485424, 0.028822937980294228, 0.04499012045562267, 0.053073709830641747, 0.10103634744882584], 's5': [-0.08940735459327698, -0.07823517918586731, -0.07310866191983223, -0.07022909820079803, -0.06740894541144371, -0.06558558344841003, -0.041897499933838844, -0.04049498960375786, -0.03781316243112087, -0.033939776942133904, -0.03263746201992035, -0.02538660168647766, -0.023219254799187183, -0.017585186287760735, -0.016525186598300934, -0.008522996446117759, 0.0015758189256303012, 0.012934560421854258, 0.014069339726120234, 0.015929921995848417, 0.01947084255516529, 0.028651678003370762, 0.03358383011072874, 0.03639278281480074, 0.041416410356760025, 0.06386702693998814], 's6': [-0.07356456853449345, -0.052854035049676895, -0.048711927607655525, -0.0383566590026021, -0.02800139266764745, -0.021788232028484344, -0.007290858076885343, -0.007290857844054699, 0.017561784014105797, 0.02377494378015399, 0.02791705122217536, 0.02998810407007113, 0.054840744473040104]}, 'categories': {}, 'untouched': ['s2', 's3', 'bp', 's4', 'age', 'sex', 's1']}\n" + ] + } + ], + "source": [ + "# note that is_regression param is True\n", + "\n", + "minimizer2 = 
GeneralizeToRepresentative(model2, target_accuracy=0.7, features=features, is_regression=True,\n", + " features_to_minimize=QI)\n", + "\n", + "# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n", + "# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n", + "# data it could result in a larger gap)\n", + "# Don't forget to leave a hold-out set for final validation!\n", + "X_generalizer_train2, x_test2, y_generalizer_train2, y_test2 = train_test_split(X_test, y_test,\n", + " test_size = 0.4, random_state = 38)\n", + "\n", + "x_train_predictions2 = model2.predict(X_generalizer_train2)\n", + "minimizer2.fit(X_generalizer_train2, x_train_predictions2)\n", + "transformed2 = minimizer2.transform(x_test2)\n", + "print('Accuracy on minimized data: ', model2.score(transformed2, y_test2))\n", + "print('generalizations: ',minimizer2.generalizations_)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/notebooks/minimization_german_credit.ipynb b/notebooks/minimization_german_credit.ipynb new file mode 100644 index 0000000..03af5f0 --- /dev/null +++ b/notebooks/minimization_german_credit.ipynb @@ -0,0 +1,385 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Applying data minimization with categorical data and only a subset of the features to a trained ML model" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + 
"In this tutorial we will show how to perform data minimization for ML models using the minimization module.\n", + "\n", + "This will be demonstrated using the German Credit dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data)." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Load data\n", + "QI parameter determines which features will be minimized." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 1, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Existing_checking_account Duration_in_month Credit_history Purpose \\\n", + "0 A14 24 A32 A41 \n", + "1 A14 33 A33 A49 \n", + "2 A11 9 A32 A42 \n", + "3 A14 28 A34 A43 \n", + "4 A11 24 A33 A43 \n", + ".. ... ... ... ... \n", + "695 A14 12 A32 A43 \n", + "696 A14 13 A32 A43 \n", + "697 A11 48 A30 A41 \n", + "698 A12 21 A34 A42 \n", + "699 A13 15 A32 A46 \n", + "\n", + " Credit_amount Savings_account Present_employment_since Installment_rate \\\n", + "0 7814 A61 A74 3 \n", + "1 2764 A61 A73 2 \n", + "2 2136 A61 A73 3 \n", + "3 2743 A61 A75 4 \n", + "4 1659 A61 A72 4 \n", + ".. ... ... ... ... \n", + "695 1963 A61 A74 4 \n", + "696 1409 A62 A71 2 \n", + "697 4605 A61 A75 3 \n", + "698 2745 A64 A74 3 \n", + "699 1905 A61 A75 4 \n", + "\n", + " Personal_status_sex debtors Present_residence Property Age \\\n", + "0 A93 A101 3 A123 38 \n", + "1 A92 A101 2 A123 26 \n", + "2 A93 A101 2 A121 25 \n", + "3 A93 A101 2 A123 29 \n", + "4 A92 A101 2 A123 29 \n", + ".. ... ... ... ... ... 
\n", + "695 A93 A101 2 A123 31 \n", + "696 A92 A101 4 A121 64 \n", + "697 A93 A101 4 A124 24 \n", + "698 A93 A101 2 A123 32 \n", + "699 A93 A101 4 A123 40 \n", + "\n", + " Other_installment_plans Housing Number_of_existing_credits Job \\\n", + "0 A143 A152 1 A174 \n", + "1 A143 A152 2 A173 \n", + "2 A143 A152 1 A173 \n", + "3 A143 A152 2 A173 \n", + "4 A143 A151 1 A172 \n", + ".. ... ... ... ... \n", + "695 A143 A151 2 A174 \n", + "696 A143 A152 1 A173 \n", + "697 A143 A153 2 A173 \n", + "698 A143 A152 2 A173 \n", + "699 A143 A151 1 A174 \n", + "\n", + " N_people_being_liable_provide_maintenance Telephone Foreign_worker \n", + "0 1 1 1 \n", + "1 1 1 1 \n", + "2 1 0 1 \n", + "3 1 0 1 \n", + "4 1 1 1 \n", + ".. ... ... ... \n", + "695 2 1 1 \n", + "696 1 0 1 \n", + "697 2 0 1 \n", + "698 1 1 1 \n", + "699 1 1 1 \n", + "\n", + "[700 rows x 20 columns]\n" + ] + } + ], + "source": [ + "from apt.utils import get_german_credit_dataset\n", + "\n", + "(x_train, y_train), (x_test, y_test) = get_german_credit_dataset()\n", + "features = [\"Existing_checking_account\", \"Duration_in_month\", \"Credit_history\", \"Purpose\", \"Credit_amount\",\n", + " \"Savings_account\", \"Present_employment_since\", \"Installment_rate\", \"Personal_status_sex\", \"debtors\",\n", + " \"Present_residence\", \"Property\", \"Age\", \"Other_installment_plans\", \"Housing\",\n", + " \"Number_of_existing_credits\", \"Job\", \"N_people_being_liable_provide_maintenance\", \"Telephone\",\n", + " \"Foreign_worker\"]\n", + "categorical_features = [\"Existing_checking_account\", \"Credit_history\", \"Purpose\", \"Savings_account\",\n", + " \"Present_employment_since\", \"Personal_status_sex\", \"debtors\", \"Property\",\n", + " \"Other_installment_plans\", \"Housing\", \"Job\"]\n", + "QI = [\"Duration_in_month\", \"Credit_history\", \"Purpose\", \"debtors\", \"Property\", \"Other_installment_plans\",\n", + " \"Housing\", \"Job\"]\n", + "\n", + "print(x_train)" + ], + "metadata": { + "collapsed": false, + 
"pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Train decision tree model\n", + "We use OneHotEncoder to handle categorical features." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Base model accuracy: 0.7033333333333334\n" + ] + } + ], + "source": [ + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "numeric_features = [f for f in features if f not in categorical_features]\n", + "numeric_transformer = Pipeline(\n", + " steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]\n", + ")\n", + "categorical_transformer = OneHotEncoder(handle_unknown=\"ignore\", sparse=False)\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " (\"num\", numeric_transformer, numeric_features),\n", + " (\"cat\", categorical_transformer, categorical_features),\n", + " ]\n", + ")\n", + "encoded_train = preprocessor.fit_transform(x_train)\n", + "model = DecisionTreeClassifier()\n", + "model.fit(encoded_train, y_train)\n", + "\n", + "encoded_test = preprocessor.transform(x_test)\n", + "print('Base model accuracy: ', model.score(encoded_test, y_test))" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Run minimization\n", + "We will try to run minimization with categorical features and only a subset of the features with different possible values of target accuracy (how close to the original model's accuracy we want to get, 1 being same accuracy as for original data)." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.791667\n", + "Improving accuracy\n", + "feature to remove: Property\n", + "Removed feature: Property, new relative accuracy: 0.819444\n", + "feature to remove: Other_installment_plans\n", + "Removed feature: Other_installment_plans, new relative accuracy: 0.833333\n", + "feature to remove: Job\n", + "Removed feature: Job, new relative accuracy: 0.833333\n", + "feature to remove: Housing\n", + "Removed feature: Housing, new relative accuracy: 0.833333\n", + "feature to remove: Purpose\n", + "Removed feature: Purpose, new relative accuracy: 0.916667\n", + "feature to remove: Credit_history\n", + "Removed feature: Credit_history, new relative accuracy: 0.930556\n", + "feature to remove: debtors\n", + "Removed feature: debtors, new relative accuracy: 0.944444\n", + "feature to remove: Duration_in_month\n", + "Removed feature: Duration_in_month, new relative accuracy: 1.000000\n", + "Accuracy on minimized data: 0.6666666666666666\n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "sys.path.insert(0, os.path.abspath('..'))\n", + "\n", + "from apt.minimization import GeneralizeToRepresentative\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# default target_accuracy is 0.998\n", + "minimizer = GeneralizeToRepresentative(model, features=features,\n", + " categorical_features=categorical_features, features_to_minimize=QI)\n", + "\n", + "# Fitting the minimizer can be done either on training or test data. 
Doing it with test data is better as the\n", + "# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n", + "# data it could result in a larger gap)\n", + "# Don't forget to leave a hold-out set for final validation!\n", + "X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(x_test, y_test, stratify=y_test,\n", + " test_size = 0.4, random_state = 38)\n", + "X_generalizer_train.reset_index(drop=True, inplace=True)\n", + "y_generalizer_train.reset_index(drop=True, inplace=True)\n", + "x_test.reset_index(drop=True, inplace=True)\n", + "y_test.reset_index(drop=True, inplace=True)\n", + "encoded_generalizer_train = preprocessor.transform(X_generalizer_train)\n", + "x_train_predictions = model.predict(encoded_generalizer_train)\n", + "minimizer.fit(X_generalizer_train, x_train_predictions)\n", + "transformed = minimizer.transform(x_test)\n", + "\n", + "encoded_transformed = preprocessor.transform(transformed)\n", + "print('Accuracy on minimized data: ', model.score(encoded_transformed, y_test))" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Let's see what features were generalized" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'ranges': {}, 'categories': {}, 'untouched': ['Purpose', 'Present_residence', 'Credit_history', 'Telephone', 'Job', 'Housing', 'Installment_rate', 'Number_of_existing_credits', 'Foreign_worker', 'Existing_checking_account', 'Other_installment_plans', 'N_people_being_liable_provide_maintenance', 'Property', 'Savings_account', 'Present_employment_since', 'Personal_status_sex', 'Duration_in_month', 'debtors', 'Credit_amount', 'Age']}\n" + ] + } + ], + "source": [ + "generalizations = minimizer.generalizations\n", + "print(generalizations)" + ], 
+ "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "We can see that for the default target accuracy of 0.998 of the original accuracy, no generalizations are possible (all features are left untouched, i.e., not generalized).\n", + "\n", + "Let's change to a slightly lower target accuracy." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.791667\n", + "Improving accuracy\n", + "feature to remove: Property\n", + "Removed feature: Property, new relative accuracy: 0.819444\n", + "feature to remove: Other_installment_plans\n", + "Removed feature: Other_installment_plans, new relative accuracy: 0.833333\n", + "feature to remove: Job\n", + "Removed feature: Job, new relative accuracy: 0.833333\n", + "feature to remove: Housing\n", + "Removed feature: Housing, new relative accuracy: 0.833333\n", + "feature to remove: Purpose\n", + "Removed feature: Purpose, new relative accuracy: 0.916667\n", + "feature to remove: Credit_history\n", + "Removed feature: Credit_history, new relative accuracy: 0.930556\n", + "Accuracy on minimized data: 0.6416666666666667\n", + "{'ranges': {'Duration_in_month': [7.0, 8.5, 11.0, 13.0, 14.0, 18.0, 23.0, 25.5, 34.5, 47.5]}, 'categories': {'debtors': [['A101', 'A102'], ['A103']]}, 'untouched': ['Existing_checking_account', 'Savings_account', 'Present_employment_since', 'Property', 'Housing', 'Purpose', 'Personal_status_sex', 'Present_residence', 'Credit_history', 'Telephone', 'Installment_rate', 'Other_installment_plans', 'Number_of_existing_credits', 'Credit_amount', 'N_people_being_liable_provide_maintenance', 'Foreign_worker', 'Age', 'Job']}\n" + ] + } + ], + "source": [ + "# We allow a 
8% deviation in accuracy from the original model accuracy\n", + "minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.92, features=features,\n", + " categorical_features=categorical_features, features_to_minimize=QI)\n", + "\n", + "minimizer2.fit(X_generalizer_train, x_train_predictions)\n", + "transformed2 = minimizer2.transform(x_test)\n", + "\n", + "encoded_transformed2 = preprocessor.transform(transformed2)\n", + "print('Accuracy on minimized data: ', model.score(encoded_transformed2, y_test))\n", + "generalizations2 = minimizer2.generalizations\n", + "print(generalizations2)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "This time we were able to generalize two features (Duration_in_month and debtors)." + ], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py index 3ed7fa6..e6f50be 100644 --- a/tests/test_minimizer.py +++ b/tests/test_minimizer.py @@ -510,7 +510,7 @@ def test_regression(): transformed = gen.transform(x_train) print('Base model accuracy (R2 score): ', model.score(x_test, y_test)) model.fit(transformed, y_train) - print('Base model accuracy (R2 score) after anonymization: ', model.score(x_test, y_test)) + print('Base model accuracy (R2 score) after minimization: ', model.score(x_test, y_test)) gener = gen.generalizations_ expexted_generalizations = {'ranges': { 'age': [-0.07816532626748085, -0.07090024650096893, -0.05637009255588055, 
-0.05092128552496433,