mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-05-09 03:52:38 +02:00
Merge remote-tracking branch 'origin/main' into main
This commit is contained in:
commit
7e2ce7fe96
5 changed files with 671 additions and 13 deletions
|
|
@ -19,4 +19,11 @@ The following figure depicts the overall process:
|
||||||
</p>
|
</p>
|
||||||
<br />
|
<br />
|
||||||
|
|
||||||
|
Citation
|
||||||
|
--------
|
||||||
|
Goldsteen A., Ezov G., Shmelkin R., Moffie M., Farkash A. (2022) Anonymizing Machine Learning Models. In: Garcia-Alfaro
|
||||||
|
J., Muñoz-Tapia J.L., Navarro-Arribas G., Soriano M. (eds) Data Privacy Management, Cryptocurrencies and Blockchain
|
||||||
|
Technology. DPM 2021, CBT 2021. Lecture Notes in Computer Science, vol 13140. Springer, Cham.
|
||||||
|
https://doi.org/10.1007/978-3-030-93944-1_8
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -37,8 +37,7 @@ The current implementation supports numeric features and categorical features.
|
||||||
Start by training your machine learning model. In this example, we will use a ``DecisionTreeClassifier``, but any
|
Start by training your machine learning model. In this example, we will use a ``DecisionTreeClassifier``, but any
|
||||||
scikit-learn model can be used. We will use the iris dataset in our example.
|
scikit-learn model can be used. We will use the iris dataset in our example.
|
||||||
|
|
||||||
.. code:: python
|
```
|
||||||
|
|
||||||
from sklearn import datasets
|
from sklearn import datasets
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
from sklearn.tree import DecisionTreeClassifier
|
from sklearn.tree import DecisionTreeClassifier
|
||||||
|
|
@ -48,35 +47,36 @@ scikit-learn model can be used. We will use the iris dataset in our example.
|
||||||
|
|
||||||
base_est = DecisionTreeClassifier()
|
base_est = DecisionTreeClassifier()
|
||||||
base_est.fit(X_train, y_train)
|
base_est.fit(X_train, y_train)
|
||||||
|
```
|
||||||
|
|
||||||
Now create the ``GeneralizeToRepresentative`` transformer and train it. Supply it with the original model and the
|
Now create the ``GeneralizeToRepresentative`` transformer and train it. Supply it with the original model and the
|
||||||
desired target accuracy. The training process may receive the original labeled training data or the model's predictions
|
desired target accuracy. The training process may receive the original labeled training data or the model's predictions
|
||||||
on the data.
|
on the data.
|
||||||
|
|
||||||
.. code:: python
|
```
|
||||||
|
|
||||||
predictions = base_est.predict(X_train)
|
predictions = base_est.predict(X_train)
|
||||||
gen = GeneralizeToRepresentative(base_est, target_accuracy=0.9)
|
gen = GeneralizeToRepresentative(base_est, target_accuracy=0.9)
|
||||||
gen.fit(X_train, predictions)
|
gen.fit(X_train, predictions)
|
||||||
|
```
|
||||||
|
|
||||||
Now use the transformer to transform new data, for example the test data.
|
Now use the transformer to transform new data, for example the test data.
|
||||||
|
|
||||||
.. code:: python
|
```
|
||||||
|
|
||||||
transformed = gen.transform(X_test)
|
transformed = gen.transform(X_test)
|
||||||
|
```
|
||||||
|
|
||||||
The transformed data has the same columns and formats as the original data, so it can be used directly to derive
|
The transformed data has the same columns and formats as the original data, so it can be used directly to derive
|
||||||
predictions from the original model.
|
predictions from the original model.
|
||||||
|
|
||||||
.. code:: python
|
```
|
||||||
|
|
||||||
new_predictions = base_est.predict(transformed)
|
new_predictions = base_est.predict(transformed)
|
||||||
|
```
|
||||||
|
|
||||||
To export the resulting generalizations, retrieve the ``Transformer``'s ``_generalize`` parameter.
|
To export the resulting generalizations, retrieve the ``Transformer``'s ``_generalize`` parameter.
|
||||||
|
|
||||||
.. code:: python
|
```
|
||||||
|
|
||||||
generalizations = base_est._generalize
|
generalizations = gen._generalize
|
||||||
|
```
|
||||||
|
|
||||||
The returned object has the following structure::
|
The returned object has the following structure::
|
||||||
|
|
||||||
|
|
@ -103,6 +103,10 @@ Where each value inside the range list represents a cutoff point. For example, f
|
||||||
this example are: ``<21.5, 21.5-39.0, 39.0-51.0, 51.0-70.5, >70.5``. The ``untouched`` list represents features that
|
this example are: ``<21.5, 21.5-39.0, 39.0-51.0, 51.0-70.5, >70.5``. The ``untouched`` list represents features that
|
||||||
were not generalized, i.e., their values should remain unchanged.
|
were not generalized, i.e., their values should remain unchanged.
|
||||||
|
|
||||||
|
Citation
|
||||||
|
--------
|
||||||
|
Goldsteen, A., Ezov, G., Shmelkin, R. et al. Data minimization for GDPR compliance in machine learning models. AI Ethics
|
||||||
|
(2021). https://doi.org/10.1007/s43681-021-00095-8
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
262
notebooks/minimization_diabetes_reg.ipynb
Normal file
262
notebooks/minimization_diabetes_reg.ipynb
Normal file
|
|
@ -0,0 +1,262 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true,
|
||||||
|
"pycharm": {
|
||||||
|
"name": "#%% md\n"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"# Applying data minimization to a trained regression ML model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"In this tutorial we will show how to perform data minimization for regression ML models using the minimization module.\n",
|
||||||
|
"\n",
|
||||||
|
"We will show how to apply data minimization to different trained regression models."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Load data\n",
|
||||||
|
"QI parameter determines which features will be minimized."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"pycharm": {
|
||||||
|
"name": "#%% md\n"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 54,
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.datasets import load_diabetes\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"\n",
|
||||||
|
"dataset = load_diabetes()\n",
|
||||||
|
"X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)\n",
|
||||||
|
"\n",
|
||||||
|
"features = ['age', 'sex', 'bmi', 'bp',\n",
|
||||||
|
" 's1', 's2', 's3', 's4', 's5', 's6']\n",
|
||||||
|
"QI = [0, 2, 5, 8, 9]"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"pycharm": {
|
||||||
|
"name": "#%%\n"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Train DecisionTreeRegressor model"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 55,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Base model accuracy (R2 score): 0.15014421352446072\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from apt.minimization import GeneralizeToRepresentative\n",
|
||||||
|
"from sklearn.tree import DecisionTreeRegressor\n",
|
||||||
|
"\n",
|
||||||
|
"model1 = DecisionTreeRegressor(random_state=10, min_samples_split=2)\n",
|
||||||
|
"model1.fit(X_train, y_train)\n",
|
||||||
|
"print('Base model accuracy (R2 score): ', model1.score(X_test, y_test))"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"pycharm": {
|
||||||
|
"name": "#%%\n"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Run minimization\n",
|
||||||
|
"We will try to run minimization with only a subset of the features."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 56,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.108922\n",
|
||||||
|
"Improving accuracy\n",
|
||||||
|
"feature to remove: s5\n",
|
||||||
|
"Removed feature: s5, new relative accuracy: 0.505498\n",
|
||||||
|
"feature to remove: s6\n",
|
||||||
|
"Removed feature: s6, new relative accuracy: 0.404757\n",
|
||||||
|
"feature to remove: bmi\n",
|
||||||
|
"Removed feature: bmi, new relative accuracy: 0.718978\n",
|
||||||
|
"Accuracy on minimized data: 0.11604533946025941\n",
|
||||||
|
"generalizations: {'ranges': {'age': [-0.07090024650096893, -0.043656209483742714, -0.041839939542114735, -0.03639113181270659, -0.01459590089507401, -0.012779632292222232, -0.009147093165665865, -0.0036982858437113464, 0.03989217430353165, 0.039892176166176796, 0.05623859912157059, 0.06713621318340302], 's2': [-0.0550188384950161, -0.0285577941685915, -0.024643437936902046, -0.02135537937283516, -0.013683241792023182, -0.006480826530605555, 0.009176596067845821, 0.023111702874302864, 0.02420772146433592, 0.02655633445829153, 0.039082273840904236]}, 'categories': {}, 'untouched': ['s3', 'bmi', 's6', 'bp', 's4', 's5', 'sex', 's1']}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# note that is_regression param is True\n",
|
||||||
|
"\n",
|
||||||
|
"minimizer1 = GeneralizeToRepresentative(model1, target_accuracy=0.7, features=features, is_regression=True,\n",
|
||||||
|
" features_to_minimize=QI)\n",
|
||||||
|
"\n",
|
||||||
|
"# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
|
||||||
|
"# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n",
|
||||||
|
"# data it could result in a larger gap)\n",
|
||||||
|
"# Don't forget to leave a hold-out set for final validation!\n",
|
||||||
|
"X_generalizer_train1, x_test1, y_generalizer_train1, y_test1 = train_test_split(X_test, y_test,\n",
|
||||||
|
" test_size = 0.4, random_state = 38)\n",
|
||||||
|
"\n",
|
||||||
|
"x_train_predictions1 = model1.predict(X_generalizer_train1)\n",
|
||||||
|
"minimizer1.fit(X_generalizer_train1, x_train_predictions1)\n",
|
||||||
|
"transformed1 = minimizer1.transform(x_test1)\n",
|
||||||
|
"print('Accuracy on minimized data: ', model1.score(transformed1, y_test1))\n",
|
||||||
|
"print('generalizations: ',minimizer1.generalizations_)#%% md"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"pycharm": {
|
||||||
|
"name": "#%%\n"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Train linear regression model"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"pycharm": {
|
||||||
|
"name": "#%% md\n"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"from sklearn.linear_model import LinearRegression\n",
|
||||||
|
"from apt.minimization import GeneralizeToRepresentative\n",
|
||||||
|
"\n",
|
||||||
|
"model2 = LinearRegression()\n",
|
||||||
|
"model2.fit(X_train, y_train)\n",
|
||||||
|
"print('Base model accuracy (R2 score): ', model2.score(X_test, y_test))"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"pycharm": {
|
||||||
|
"name": "#%%\n"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Run minimization\n",
|
||||||
|
"We will try to run minimization with only a subset of the features."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 58,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.225782\n",
|
||||||
|
"Improving accuracy\n",
|
||||||
|
"feature to remove: age\n",
|
||||||
|
"Removed feature: age, new relative accuracy: 0.223565\n",
|
||||||
|
"feature to remove: s2\n",
|
||||||
|
"Removed feature: s2, new relative accuracy: 0.759788\n",
|
||||||
|
"Accuracy on minimized data: 0.4414329261774286\n",
|
||||||
|
"generalizations: {'ranges': {'bmi': [-0.0660245232284069, -0.06171327643096447, -0.048779530450701714, -0.036923596635460854, -0.022912041284143925, -0.015906263142824173, -0.009978296235203743, 0.007266696775332093, 0.022356065921485424, 0.028822937980294228, 0.04499012045562267, 0.053073709830641747, 0.10103634744882584], 's5': [-0.08940735459327698, -0.07823517918586731, -0.07310866191983223, -0.07022909820079803, -0.06740894541144371, -0.06558558344841003, -0.041897499933838844, -0.04049498960375786, -0.03781316243112087, -0.033939776942133904, -0.03263746201992035, -0.02538660168647766, -0.023219254799187183, -0.017585186287760735, -0.016525186598300934, -0.008522996446117759, 0.0015758189256303012, 0.012934560421854258, 0.014069339726120234, 0.015929921995848417, 0.01947084255516529, 0.028651678003370762, 0.03358383011072874, 0.03639278281480074, 0.041416410356760025, 0.06386702693998814], 's6': [-0.07356456853449345, -0.052854035049676895, -0.048711927607655525, -0.0383566590026021, -0.02800139266764745, -0.021788232028484344, -0.007290858076885343, -0.007290857844054699, 0.017561784014105797, 0.02377494378015399, 0.02791705122217536, 0.02998810407007113, 0.054840744473040104]}, 'categories': {}, 'untouched': ['s2', 's3', 'bp', 's4', 'age', 'sex', 's1']}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# note that is_regression param is True\n",
|
||||||
|
"\n",
|
||||||
|
"minimizer2 = GeneralizeToRepresentative(model2, target_accuracy=0.7, features=features, is_regression=True,\n",
|
||||||
|
" features_to_minimize=QI)\n",
|
||||||
|
"\n",
|
||||||
|
"# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
|
||||||
|
"# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n",
|
||||||
|
"# data it could result in a larger gap)\n",
|
||||||
|
"# Don't forget to leave a hold-out set for final validation!\n",
|
||||||
|
"X_generalizer_train2, x_test2, y_generalizer_train2, y_test2 = train_test_split(X_test, y_test,\n",
|
||||||
|
" test_size = 0.4, random_state = 38)\n",
|
||||||
|
"\n",
|
||||||
|
"x_train_predictions2 = model2.predict(X_generalizer_train2)\n",
|
||||||
|
"minimizer2.fit(X_generalizer_train2, x_train_predictions2)\n",
|
||||||
|
"transformed2 = minimizer2.transform(x_test2)\n",
|
||||||
|
"print('Accuracy on minimized data: ', model2.score(transformed2, y_test2))\n",
|
||||||
|
"print('generalizations: ',minimizer2.generalizations_)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"pycharm": {
|
||||||
|
"name": "#%%\n"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 2
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython2",
|
||||||
|
"version": "2.7.6"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 0
|
||||||
|
}
|
||||||
385
notebooks/minimization_german_credit.ipynb
Normal file
385
notebooks/minimization_german_credit.ipynb
Normal file
|
|
@ -0,0 +1,385 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"# Applying data minimization with categorical data and only a subset of the features to a trained ML model"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"In this tutorial we will show how to perform data minimization for ML models using the minimization module.\n",
|
||||||
|
"\n",
|
||||||
|
"This will be demonstrated using the German Credit dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data)."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Load data\n",
|
||||||
|
"QI parameter determines which features will be minimized."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" Existing_checking_account Duration_in_month Credit_history Purpose \\\n",
|
||||||
|
"0 A14 24 A32 A41 \n",
|
||||||
|
"1 A14 33 A33 A49 \n",
|
||||||
|
"2 A11 9 A32 A42 \n",
|
||||||
|
"3 A14 28 A34 A43 \n",
|
||||||
|
"4 A11 24 A33 A43 \n",
|
||||||
|
".. ... ... ... ... \n",
|
||||||
|
"695 A14 12 A32 A43 \n",
|
||||||
|
"696 A14 13 A32 A43 \n",
|
||||||
|
"697 A11 48 A30 A41 \n",
|
||||||
|
"698 A12 21 A34 A42 \n",
|
||||||
|
"699 A13 15 A32 A46 \n",
|
||||||
|
"\n",
|
||||||
|
" Credit_amount Savings_account Present_employment_since Installment_rate \\\n",
|
||||||
|
"0 7814 A61 A74 3 \n",
|
||||||
|
"1 2764 A61 A73 2 \n",
|
||||||
|
"2 2136 A61 A73 3 \n",
|
||||||
|
"3 2743 A61 A75 4 \n",
|
||||||
|
"4 1659 A61 A72 4 \n",
|
||||||
|
".. ... ... ... ... \n",
|
||||||
|
"695 1963 A61 A74 4 \n",
|
||||||
|
"696 1409 A62 A71 2 \n",
|
||||||
|
"697 4605 A61 A75 3 \n",
|
||||||
|
"698 2745 A64 A74 3 \n",
|
||||||
|
"699 1905 A61 A75 4 \n",
|
||||||
|
"\n",
|
||||||
|
" Personal_status_sex debtors Present_residence Property Age \\\n",
|
||||||
|
"0 A93 A101 3 A123 38 \n",
|
||||||
|
"1 A92 A101 2 A123 26 \n",
|
||||||
|
"2 A93 A101 2 A121 25 \n",
|
||||||
|
"3 A93 A101 2 A123 29 \n",
|
||||||
|
"4 A92 A101 2 A123 29 \n",
|
||||||
|
".. ... ... ... ... ... \n",
|
||||||
|
"695 A93 A101 2 A123 31 \n",
|
||||||
|
"696 A92 A101 4 A121 64 \n",
|
||||||
|
"697 A93 A101 4 A124 24 \n",
|
||||||
|
"698 A93 A101 2 A123 32 \n",
|
||||||
|
"699 A93 A101 4 A123 40 \n",
|
||||||
|
"\n",
|
||||||
|
" Other_installment_plans Housing Number_of_existing_credits Job \\\n",
|
||||||
|
"0 A143 A152 1 A174 \n",
|
||||||
|
"1 A143 A152 2 A173 \n",
|
||||||
|
"2 A143 A152 1 A173 \n",
|
||||||
|
"3 A143 A152 2 A173 \n",
|
||||||
|
"4 A143 A151 1 A172 \n",
|
||||||
|
".. ... ... ... ... \n",
|
||||||
|
"695 A143 A151 2 A174 \n",
|
||||||
|
"696 A143 A152 1 A173 \n",
|
||||||
|
"697 A143 A153 2 A173 \n",
|
||||||
|
"698 A143 A152 2 A173 \n",
|
||||||
|
"699 A143 A151 1 A174 \n",
|
||||||
|
"\n",
|
||||||
|
" N_people_being_liable_provide_maintenance Telephone Foreign_worker \n",
|
||||||
|
"0 1 1 1 \n",
|
||||||
|
"1 1 1 1 \n",
|
||||||
|
"2 1 0 1 \n",
|
||||||
|
"3 1 0 1 \n",
|
||||||
|
"4 1 1 1 \n",
|
||||||
|
".. ... ... ... \n",
|
||||||
|
"695 2 1 1 \n",
|
||||||
|
"696 1 0 1 \n",
|
||||||
|
"697 2 0 1 \n",
|
||||||
|
"698 1 1 1 \n",
|
||||||
|
"699 1 1 1 \n",
|
||||||
|
"\n",
|
||||||
|
"[700 rows x 20 columns]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from apt.utils import get_german_credit_dataset\n",
|
||||||
|
"\n",
|
||||||
|
"(x_train, y_train), (x_test, y_test) = get_german_credit_dataset()\n",
|
||||||
|
"features = [\"Existing_checking_account\", \"Duration_in_month\", \"Credit_history\", \"Purpose\", \"Credit_amount\",\n",
|
||||||
|
" \"Savings_account\", \"Present_employment_since\", \"Installment_rate\", \"Personal_status_sex\", \"debtors\",\n",
|
||||||
|
" \"Present_residence\", \"Property\", \"Age\", \"Other_installment_plans\", \"Housing\",\n",
|
||||||
|
" \"Number_of_existing_credits\", \"Job\", \"N_people_being_liable_provide_maintenance\", \"Telephone\",\n",
|
||||||
|
" \"Foreign_worker\"]\n",
|
||||||
|
"categorical_features = [\"Existing_checking_account\", \"Credit_history\", \"Purpose\", \"Savings_account\",\n",
|
||||||
|
" \"Present_employment_since\", \"Personal_status_sex\", \"debtors\", \"Property\",\n",
|
||||||
|
" \"Other_installment_plans\", \"Housing\", \"Job\"]\n",
|
||||||
|
"QI = [\"Duration_in_month\", \"Credit_history\", \"Purpose\", \"debtors\", \"Property\", \"Other_installment_plans\",\n",
|
||||||
|
" \"Housing\", \"Job\"]\n",
|
||||||
|
"\n",
|
||||||
|
"print(x_train)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"pycharm": {
|
||||||
|
"name": "#%%\n"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Train decision tree model\n",
|
||||||
|
"We use OneHotEncoder to handle categorical features."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Base model accuracy: 0.7033333333333334\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.compose import ColumnTransformer\n",
|
||||||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
||||||
|
"from sklearn.impute import SimpleImputer\n",
|
||||||
|
"from sklearn.pipeline import Pipeline\n",
|
||||||
|
"from sklearn.tree import DecisionTreeClassifier\n",
|
||||||
|
"numeric_features = [f for f in features if f not in categorical_features]\n",
|
||||||
|
"numeric_transformer = Pipeline(\n",
|
||||||
|
" steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]\n",
|
||||||
|
")\n",
|
||||||
|
"categorical_transformer = OneHotEncoder(handle_unknown=\"ignore\", sparse=False)\n",
|
||||||
|
"preprocessor = ColumnTransformer(\n",
|
||||||
|
" transformers=[\n",
|
||||||
|
" (\"num\", numeric_transformer, numeric_features),\n",
|
||||||
|
" (\"cat\", categorical_transformer, categorical_features),\n",
|
||||||
|
" ]\n",
|
||||||
|
")\n",
|
||||||
|
"encoded_train = preprocessor.fit_transform(x_train)\n",
|
||||||
|
"model = DecisionTreeClassifier()\n",
|
||||||
|
"model.fit(encoded_train, y_train)\n",
|
||||||
|
"\n",
|
||||||
|
"encoded_test = preprocessor.transform(x_test)\n",
|
||||||
|
"print('Base model accuracy: ', model.score(encoded_test, y_test))"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"pycharm": {
|
||||||
|
"name": "#%%\n"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Run minimization\n",
|
||||||
|
"We will try to run minimization with categorical features and only a subset of the features with different possible values of target accuracy (how close to the original model's accuracy we want to get, 1 being same accuracy as for original data)."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.791667\n",
|
||||||
|
"Improving accuracy\n",
|
||||||
|
"feature to remove: Property\n",
|
||||||
|
"Removed feature: Property, new relative accuracy: 0.819444\n",
|
||||||
|
"feature to remove: Other_installment_plans\n",
|
||||||
|
"Removed feature: Other_installment_plans, new relative accuracy: 0.833333\n",
|
||||||
|
"feature to remove: Job\n",
|
||||||
|
"Removed feature: Job, new relative accuracy: 0.833333\n",
|
||||||
|
"feature to remove: Housing\n",
|
||||||
|
"Removed feature: Housing, new relative accuracy: 0.833333\n",
|
||||||
|
"feature to remove: Purpose\n",
|
||||||
|
"Removed feature: Purpose, new relative accuracy: 0.916667\n",
|
||||||
|
"feature to remove: Credit_history\n",
|
||||||
|
"Removed feature: Credit_history, new relative accuracy: 0.930556\n",
|
||||||
|
"feature to remove: debtors\n",
|
||||||
|
"Removed feature: debtors, new relative accuracy: 0.944444\n",
|
||||||
|
"feature to remove: Duration_in_month\n",
|
||||||
|
"Removed feature: Duration_in_month, new relative accuracy: 1.000000\n",
|
||||||
|
"Accuracy on minimized data: 0.6666666666666666\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import sys\n",
|
||||||
|
"import os\n",
|
||||||
|
"sys.path.insert(0, os.path.abspath('..'))\n",
|
||||||
|
"\n",
|
||||||
|
"from apt.minimization import GeneralizeToRepresentative\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"\n",
|
||||||
|
"# default target_accuracy is 0.998\n",
|
||||||
|
"minimizer = GeneralizeToRepresentative(model, features=features,\n",
|
||||||
|
" categorical_features=categorical_features, features_to_minimize=QI)\n",
|
||||||
|
"\n",
|
||||||
|
"# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
|
||||||
|
"# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n",
|
||||||
|
"# data it could result in a larger gap)\n",
|
||||||
|
"# Don't forget to leave a hold-out set for final validation!\n",
|
||||||
|
"X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(x_test, y_test, stratify=y_test,\n",
|
||||||
|
" test_size = 0.4, random_state = 38)\n",
|
||||||
|
"X_generalizer_train.reset_index(drop=True, inplace=True)\n",
|
||||||
|
"y_generalizer_train.reset_index(drop=True, inplace=True)\n",
|
||||||
|
"x_test.reset_index(drop=True, inplace=True)\n",
|
||||||
|
"y_test.reset_index(drop=True, inplace=True)\n",
|
||||||
|
"encoded_generalizer_train = preprocessor.transform(X_generalizer_train)\n",
|
||||||
|
"x_train_predictions = model.predict(encoded_generalizer_train)\n",
|
||||||
|
"minimizer.fit(X_generalizer_train, x_train_predictions)\n",
|
||||||
|
"transformed = minimizer.transform(x_test)\n",
|
||||||
|
"\n",
|
||||||
|
"encoded_transformed = preprocessor.transform(transformed)\n",
|
||||||
|
"print('Accuracy on minimized data: ', model.score(encoded_transformed, y_test))"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"pycharm": {
|
||||||
|
"name": "#%%\n"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"#### Let's see what features were generalized"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'ranges': {}, 'categories': {}, 'untouched': ['Purpose', 'Present_residence', 'Credit_history', 'Telephone', 'Job', 'Housing', 'Installment_rate', 'Number_of_existing_credits', 'Foreign_worker', 'Existing_checking_account', 'Other_installment_plans', 'N_people_being_liable_provide_maintenance', 'Property', 'Savings_account', 'Present_employment_since', 'Personal_status_sex', 'Duration_in_month', 'debtors', 'Credit_amount', 'Age']}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"generalizations = minimizer.generalizations\n",
|
||||||
|
"print(generalizations)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"pycharm": {
|
||||||
|
"name": "#%%\n"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"We can see that for the default target accuracy of 0.998 of the original accuracy, no generalizations are possible (all features are left untouched, i.e., not generalized).\n",
|
||||||
|
"\n",
|
||||||
|
"Let's change to a slightly lower target accuracy."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.791667\n",
|
||||||
|
"Improving accuracy\n",
|
||||||
|
"feature to remove: Property\n",
|
||||||
|
"Removed feature: Property, new relative accuracy: 0.819444\n",
|
||||||
|
"feature to remove: Other_installment_plans\n",
|
||||||
|
"Removed feature: Other_installment_plans, new relative accuracy: 0.833333\n",
|
||||||
|
"feature to remove: Job\n",
|
||||||
|
"Removed feature: Job, new relative accuracy: 0.833333\n",
|
||||||
|
"feature to remove: Housing\n",
|
||||||
|
"Removed feature: Housing, new relative accuracy: 0.833333\n",
|
||||||
|
"feature to remove: Purpose\n",
|
||||||
|
"Removed feature: Purpose, new relative accuracy: 0.916667\n",
|
||||||
|
"feature to remove: Credit_history\n",
|
||||||
|
"Removed feature: Credit_history, new relative accuracy: 0.930556\n",
|
||||||
|
"Accuracy on minimized data: 0.6416666666666667\n",
|
||||||
|
"{'ranges': {'Duration_in_month': [7.0, 8.5, 11.0, 13.0, 14.0, 18.0, 23.0, 25.5, 34.5, 47.5]}, 'categories': {'debtors': [['A101', 'A102'], ['A103']]}, 'untouched': ['Existing_checking_account', 'Savings_account', 'Present_employment_since', 'Property', 'Housing', 'Purpose', 'Personal_status_sex', 'Present_residence', 'Credit_history', 'Telephone', 'Installment_rate', 'Other_installment_plans', 'Number_of_existing_credits', 'Credit_amount', 'N_people_being_liable_provide_maintenance', 'Foreign_worker', 'Age', 'Job']}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# We allow up to an 8% deviation in accuracy from the original model accuracy (target_accuracy=0.92)\n",
|
||||||
|
"minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.92, features=features,\n",
|
||||||
|
" categorical_features=categorical_features, features_to_minimize=QI)\n",
|
||||||
|
"\n",
|
||||||
|
"minimizer2.fit(X_generalizer_train, x_train_predictions)\n",
|
||||||
|
"transformed2 = minimizer2.transform(x_test)\n",
|
||||||
|
"\n",
|
||||||
|
"encoded_transformed2 = preprocessor.transform(transformed2)\n",
|
||||||
|
"print('Accuracy on minimized data: ', model.score(encoded_transformed2, y_test))\n",
|
||||||
|
"generalizations2 = minimizer2.generalizations\n",
|
||||||
|
"print(generalizations2)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"pycharm": {
|
||||||
|
"name": "#%%\n"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"This time we were able to generalize two features (Duration_in_month and debtors)."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 2
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython2",
|
||||||
|
"version": "2.7.6"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 0
|
||||||
|
}
|
||||||
|
|
@ -510,7 +510,7 @@ def test_regression():
|
||||||
transformed = gen.transform(x_train)
|
transformed = gen.transform(x_train)
|
||||||
print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
|
print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
|
||||||
model.fit(transformed, y_train)
|
model.fit(transformed, y_train)
|
||||||
print('Base model accuracy (R2 score) after anonymization: ', model.score(x_test, y_test))
|
print('Base model accuracy (R2 score) after minimization: ', model.score(x_test, y_test))
|
||||||
gener = gen.generalizations_
|
gener = gen.generalizations_
|
||||||
expexted_generalizations = {'ranges': {
|
expexted_generalizations = {'ranges': {
|
||||||
'age': [-0.07816532626748085, -0.07090024650096893, -0.05637009255588055, -0.05092128552496433,
|
'age': [-0.07816532626748085, -0.07090024650096893, -0.05637009255588055, -0.05092128552496433,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue