mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-04-24 20:36:21 +02:00
* Limit scikit-learn versions between 0.22.2 and 1.1.3, remove deprecated load_boston(). * Set pytest configuration option to show test progress in detail. * Change np.int to int according to DeprecationWarning Signed-off-by: Maya Anderson <mayaa@il.ibm.com>
267 lines
No EOL
8.9 KiB
Text
267 lines
No EOL
8.9 KiB
Text
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Applying data minimization to a trained ML model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"In this tutorial we will show how to perform data minimization for ML models using the minimization module. \n",
|
|
"\n",
|
|
"This will be demonstrated using the Adult dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/datasets/adult). \n",
|
|
"\n",
|
|
"We use only the numerical features in the dataset because this is what is currently supported by the module."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Load data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[[3.9000e+01 1.3000e+01 2.1740e+03 0.0000e+00 4.0000e+01]\n",
|
|
" [5.0000e+01 1.3000e+01 0.0000e+00 0.0000e+00 1.3000e+01]\n",
|
|
" [3.8000e+01 9.0000e+00 0.0000e+00 0.0000e+00 4.0000e+01]\n",
|
|
" ...\n",
|
|
" [5.8000e+01 9.0000e+00 0.0000e+00 0.0000e+00 4.0000e+01]\n",
|
|
" [2.2000e+01 9.0000e+00 0.0000e+00 0.0000e+00 2.0000e+01]\n",
|
|
" [5.2000e+01 9.0000e+00 1.5024e+04 0.0000e+00 4.0000e+01]]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"\n",
|
|
"# Use only numeric features (age, education-num, capital-gain, capital-loss, hours-per-week)\n",
|
|
"x_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
|
|
" usecols=(0, 4, 10, 11, 12), delimiter=\", \")\n",
|
|
"\n",
|
|
"y_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
|
|
" usecols=14, dtype=str, delimiter=\", \")\n",
|
|
"\n",
|
|
"\n",
|
|
"x_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
|
|
" usecols=(0, 4, 10, 11, 12), delimiter=\", \", skiprows=1)\n",
|
|
"\n",
|
|
"y_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
|
|
" usecols=14, dtype=str, delimiter=\", \", skiprows=1)\n",
|
|
"\n",
|
|
"# Trim trailing period \".\" from label\n",
|
|
"y_test = np.array([a[:-1] for a in y_test])\n",
|
|
"\n",
|
|
"y_train[y_train == '<=50K'] = 0\n",
|
|
"y_train[y_train == '>50K'] = 1\n",
|
|
"y_train = y_train.astype(int)\n",
|
|
"\n",
|
|
"y_test[y_test == '<=50K'] = 0\n",
|
|
"y_test[y_test == '>50K'] = 1\n",
|
|
"y_test = y_test.astype(int)\n",
|
|
"\n",
|
|
"print(x_train)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Train decision tree model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Base model accuracy: 0.8190528837295007\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import os\n",
|
|
"import sys\n",
|
|
"sys.path.insert(0, os.path.abspath('..'))\n",
|
|
"\n",
|
|
"from apt.utils.datasets import ArrayDataset\n",
|
|
"from apt.utils.models import SklearnClassifier, ModelOutputType\n",
|
|
"from sklearn.tree import DecisionTreeClassifier\n",
|
|
"\n",
|
|
"base_est = DecisionTreeClassifier()\n",
|
|
"model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)\n",
|
|
"model.fit(ArrayDataset(x_train, y_train))\n",
|
|
"\n",
|
|
"print('Base model accuracy: ', model.score(ArrayDataset(x_test, y_test)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Run minimization\n",
|
|
"We will try to run minimization with different possible values of target accuracy (how close to the original model's accuracy we want to get, 1 being the same accuracy as on the original data)."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.920665\n",
|
|
"Improving accuracy\n",
|
|
"feature to remove: 1\n",
|
|
"Removed feature: 1, new relative accuracy: 0.920026\n",
|
|
"feature to remove: 0\n",
|
|
"Removed feature: 0, new relative accuracy: 0.938580\n",
|
|
"feature to remove: 4\n",
|
|
"Removed feature: 4, new relative accuracy: 0.987204\n",
|
|
"feature to remove: 2\n",
|
|
"Removed feature: 2, new relative accuracy: 0.992962\n",
|
|
"feature to remove: 3\n",
|
|
"Removed feature: 3, new relative accuracy: 1.000000\n",
|
|
"Accuracy on minimized data: 0.8165771297006907\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from apt.minimization import GeneralizeToRepresentative\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"\n",
|
|
"# default target_accuracy is 0.998\n",
|
|
"minimizer = GeneralizeToRepresentative(model)\n",
|
|
"\n",
|
|
"# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the \n",
|
|
"# resulting accuracy on test data will be closer to the desired target accuracy (when working with training \n",
|
|
"# data it could result in a larger gap)\n",
|
|
"# Don't forget to leave a hold-out set for final validation!\n",
|
|
"X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(x_test, y_test, stratify=y_test,\n",
|
|
" test_size = 0.4, random_state = 38)\n",
|
|
"x_train_predictions = model.predict(ArrayDataset(X_generalizer_train))\n",
|
|
"if x_train_predictions.shape[1] > 1:\n",
|
|
" x_train_predictions = np.argmax(x_train_predictions, axis=1)\n",
|
|
"minimizer.fit(dataset=ArrayDataset(X_generalizer_train, x_train_predictions))\n",
|
|
"transformed = minimizer.transform(dataset=ArrayDataset(x_test))\n",
|
|
"\n",
|
|
"print('Accuracy on minimized data: ', model.score(ArrayDataset(transformed, y_test)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Let's see what features were generalized"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"{'ranges': {}, 'categories': {}, 'untouched': ['2', '4', '3', '1', '0']}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"generalizations = minimizer.generalizations\n",
|
|
"print(generalizations)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"We can see that for the default target accuracy of 0.998 of the original accuracy, no generalizations are possible (all features are left untouched, i.e., not generalized).\n",
|
|
"\n",
|
|
"Let's change to a slightly lower target accuracy."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.920665\n",
|
|
"Improving accuracy\n",
|
|
"feature to remove: 1\n",
|
|
"Removed feature: 1, new relative accuracy: 0.920026\n",
|
|
"feature to remove: 0\n",
|
|
"Removed feature: 0, new relative accuracy: 0.938580\n",
|
|
"feature to remove: 4\n",
|
|
"Removed feature: 4, new relative accuracy: 0.987204\n",
|
|
"feature to remove: 2\n",
|
|
"Removed feature: 2, new relative accuracy: 0.992962\n",
|
|
"Accuracy on minimized data: 0.8100537221795856\n",
|
|
"{'ranges': {'3': [704.0, 782.0, 870.0, 951.0, 1588.0, 1647.5, 1684.0, 1805.0, 1923.0, 2168.5]}, 'categories': {}, 'untouched': ['2', '4', '1', '0']}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# We allow a 1% deviation in accuracy from the original model accuracy\n",
|
|
"minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.99)\n",
|
|
"\n",
|
|
"minimizer2.fit(dataset=ArrayDataset(X_generalizer_train, x_train_predictions))\n",
|
|
"transformed2 = minimizer2.transform(dataset=ArrayDataset(x_test))\n",
|
|
"print('Accuracy on minimized data: ', model.score(test_data=ArrayDataset(transformed2, y_test)))\n",
|
|
"generalizations2 = minimizer2.generalizations\n",
|
|
"print(generalizations2)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"This time we were able to generalize one feature, feature number 3 (capital-loss)."
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
} |