mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-04-24 20:36:21 +02:00
* keras wrapper + blackbox classifier wrapper (fix #7) * fix error in NCP calculation * Update notebooks * Fix #25 (incorrect attack_feature indexes for social feature in notebook) * Consistent naming of internal parameters
253 lines
9.5 KiB
Text
253 lines
9.5 KiB
Text
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"collapsed": true,
|
|
"pycharm": {
|
|
"name": "#%% md\n"
|
|
}
|
|
},
|
|
"source": [
|
|
"# Applying data minimization to a trained regression ML model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"In this tutorial we will show how to perform data minimization for regression ML models using the minimization module.\n",
|
|
"\n",
|
|
"We will demonstrate applying data minimization to two different trained regression models: a decision tree regressor and a linear regression model."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"pycharm": {
|
|
"name": "#%% md\n"
|
|
}
|
|
},
|
|
"source": [
|
|
"## Load data\n",
|
|
"The `QI` list determines which features will be minimized (it is passed to the minimizer as `features_to_minimize`)."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from sklearn.datasets import load_diabetes\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"\n",
|
|
"dataset = load_diabetes()\n",
|
|
"X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)\n",
|
|
"\n",
|
|
"features = ['age', 'sex', 'bmi', 'bp',\n",
|
|
" 's1', 's2', 's3', 's4', 's5', 's6']\n",
|
|
"QI = ['age', 'bmi', 's2', 's5', 's6']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Train DecisionTreeRegressor model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Base model accuracy (R2 score): 0.15014421352446072\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from apt.minimization import GeneralizeToRepresentative\n",
|
|
"from sklearn.tree import DecisionTreeRegressor\n",
|
|
"\n",
|
|
"model1 = DecisionTreeRegressor(random_state=10, min_samples_split=2)\n",
|
|
"model1.fit(X_train, y_train)\n",
|
|
"print('Base model accuracy (R2 score): ', model1.score(X_test, y_test))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Run minimization\n",
|
|
"We will try to run minimization with only a subset of the features."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.108922\n",
|
|
"Improving accuracy\n",
|
|
"feature to remove: s5\n",
|
|
"Removed feature: s5, new relative accuracy: 0.505498\n",
|
|
"feature to remove: s6\n",
|
|
"Removed feature: s6, new relative accuracy: 0.404757\n",
|
|
"feature to remove: bmi\n",
|
|
"Removed feature: bmi, new relative accuracy: 0.718978\n",
|
|
"Accuracy on minimized data: 0.11604533946025941\n",
|
|
"generalizations: {'ranges': {'age': [-0.07090024650096893, -0.043656209483742714, -0.041839939542114735, -0.03639113181270659, -0.01459590089507401, -0.012779632292222232, -0.009147093165665865, -0.0036982858437113464, 0.03989217430353165, 0.039892176166176796, 0.05623859912157059, 0.06713621318340302], 's2': [-0.0550188384950161, -0.0285577941685915, -0.024643437936902046, -0.02135537937283516, -0.013683241792023182, -0.006480826530605555, 0.009176596067845821, 0.023111702874302864, 0.02420772146433592, 0.02655633445829153, 0.039082273840904236]}, 'categories': {}, 'untouched': ['s3', 's6', 's5', 'bp', 'bmi', 's4', 's1', 'sex']}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# note that is_regression param is True\n",
|
|
"\n",
|
|
"minimizer1 = GeneralizeToRepresentative(model1, target_accuracy=0.7, is_regression=True,\n",
|
|
" features_to_minimize=QI)\n",
|
|
"\n",
|
|
"# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
|
|
"# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n",
|
|
"# data it could result in a larger gap)\n",
|
|
"# Don't forget to leave a hold-out set for final validation!\n",
|
|
"X_generalizer_train1, x_test1, y_generalizer_train1, y_test1 = train_test_split(X_test, y_test,\n",
|
|
" test_size = 0.4, random_state = 38)\n",
|
|
"\n",
|
|
"x_train_predictions1 = model1.predict(X_generalizer_train1)\n",
|
|
"minimizer1.fit(X_generalizer_train1, x_train_predictions1, features_names=features)\n",
|
|
"transformed1 = minimizer1.transform(x_test1, features_names=features)\n",
|
|
"print('Accuracy on minimized data: ', model1.score(transformed1, y_test1))\n",
|
|
"print('generalizations: ',minimizer1.generalizations_)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"pycharm": {
|
|
"name": "#%% md\n"
|
|
}
|
|
},
|
|
"source": [
|
|
"## Train linear regression model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Base model accuracy (R2 score): 0.5080618258593723\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn.linear_model import LinearRegression\n",
|
|
"from apt.minimization import GeneralizeToRepresentative\n",
|
|
"\n",
|
|
"model2 = LinearRegression()\n",
|
|
"model2.fit(X_train, y_train)\n",
|
|
"print('Base model accuracy (R2 score): ', model2.score(X_test, y_test))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Run minimization\n",
|
|
"We will try to run minimization with only a subset of the features."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.355377\n",
|
|
"Improving accuracy\n",
|
|
"feature to remove: s2\n",
|
|
"Removed feature: s2, new relative accuracy: 0.773233\n",
|
|
"Accuracy on minimized data: 0.3945625296515525\n",
|
|
"generalizations: {'ranges': {'age': [-0.06181889958679676, -0.027309785597026348, -0.012779631884768605, -0.0036982858437113464, -0.001882016658782959, 0.0035667913034558296, 0.01991321425884962, 0.021729483967646956, 0.02717829099856317, 0.04534098319709301, 0.05805486813187599], 'bmi': [-0.0660245232284069, -0.06171327643096447, -0.048779530450701714, -0.036923596635460854, -0.022912041284143925, -0.015906263142824173, -0.009978296235203743, 0.007266696775332093, 0.022356065921485424, 0.028822937980294228, 0.04499012045562267, 0.04876246117055416, 0.053073709830641747, 0.10103634744882584], 's5': [-0.08940735459327698, -0.07823517918586731, -0.07310866191983223, -0.07022909820079803, -0.06740894541144371, -0.06558558344841003, -0.041897499933838844, -0.03781316243112087, -0.033939776942133904, -0.03263746201992035, -0.02538660168647766, -0.023219254799187183, -0.017585186287760735, -0.016525186598300934, -0.008522996446117759, -0.0048803192912600935, 0.0002040128456428647, 0.0015758189256303012, 0.008132445393130183, 0.012934560421854258, 0.014069339726120234, 0.015929921995848417, 0.01947084255516529, 0.028651678003370762, 0.03358383011072874, 0.03639278281480074, 0.041416410356760025], 's6': [-0.07356456853449345, -0.052854035049676895, -0.048711927607655525, -0.044569820165634155, -0.0383566590026021, -0.021788232028484344, -0.017646125052124262, -0.013504017610102892, 0.02377494378015399, 0.06519601307809353, 0.08383549377322197]}, 'categories': {}, 'untouched': ['s3', 's2', 'bp', 's4', 's1', 'sex']}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# note that is_regression param is True\n",
|
|
"\n",
|
|
"minimizer2 = GeneralizeToRepresentative(model2, target_accuracy=0.7, is_regression=True,\n",
|
|
" features_to_minimize=QI)\n",
|
|
"\n",
|
|
"# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
|
|
"# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n",
|
|
"# data it could result in a larger gap)\n",
|
|
"# Don't forget to leave a hold-out set for final validation!\n",
|
|
"X_generalizer_train2, x_test2, y_generalizer_train2, y_test2 = train_test_split(X_test, y_test,\n",
|
|
" test_size = 0.4, random_state = 38)\n",
|
|
"\n",
|
|
"x_train_predictions2 = model2.predict(X_generalizer_train2)\n",
|
|
"minimizer2.fit(X_generalizer_train2, x_train_predictions2, features_names=features)\n",
|
|
"transformed2 = minimizer2.transform(x_test2, features_names=features)\n",
|
|
"print('Accuracy on minimized data: ', model2.score(transformed2, y_test2))\n",
|
|
"print('generalizations: ',minimizer2.generalizations_)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|