mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-04-27 05:46:22 +02:00
* keras wrapper + blackbox classifier wrapper (fix #7) * fix error in NCP calculation * Update notebooks * Fix #25 (incorrect attack_feature indexes for social feature in notebook) * Consistent naming of internal parameters
368 lines
16 KiB
Text
368 lines
16 KiB
Text
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Applying data minimization with categorical data and only a subset of the features to a trained ML model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"In this tutorial we will show how to perform data minimization for ML models using the minimization module.\n",
|
|
"\n",
|
|
"This will be demonstrated using the German Credit dataset (the original dataset can be found here: https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data)."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Load data\n",
|
|
"The QI list determines which features will be minimized; it is later passed to the minimizer as `features_to_minimize`."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" Existing_checking_account Duration_in_month Credit_history Purpose \\\n",
|
|
"0 A14 24 A32 A41 \n",
|
|
"1 A14 33 A33 A49 \n",
|
|
"2 A11 9 A32 A42 \n",
|
|
"3 A14 28 A34 A43 \n",
|
|
"4 A11 24 A33 A43 \n",
|
|
".. ... ... ... ... \n",
|
|
"695 A14 12 A32 A43 \n",
|
|
"696 A14 13 A32 A43 \n",
|
|
"697 A11 48 A30 A41 \n",
|
|
"698 A12 21 A34 A42 \n",
|
|
"699 A13 15 A32 A46 \n",
|
|
"\n",
|
|
" Credit_amount Savings_account Present_employment_since Installment_rate \\\n",
|
|
"0 7814 A61 A74 3 \n",
|
|
"1 2764 A61 A73 2 \n",
|
|
"2 2136 A61 A73 3 \n",
|
|
"3 2743 A61 A75 4 \n",
|
|
"4 1659 A61 A72 4 \n",
|
|
".. ... ... ... ... \n",
|
|
"695 1963 A61 A74 4 \n",
|
|
"696 1409 A62 A71 2 \n",
|
|
"697 4605 A61 A75 3 \n",
|
|
"698 2745 A64 A74 3 \n",
|
|
"699 1905 A61 A75 4 \n",
|
|
"\n",
|
|
" Personal_status_sex debtors Present_residence Property Age \\\n",
|
|
"0 A93 A101 3 A123 38 \n",
|
|
"1 A92 A101 2 A123 26 \n",
|
|
"2 A93 A101 2 A121 25 \n",
|
|
"3 A93 A101 2 A123 29 \n",
|
|
"4 A92 A101 2 A123 29 \n",
|
|
".. ... ... ... ... ... \n",
|
|
"695 A93 A101 2 A123 31 \n",
|
|
"696 A92 A101 4 A121 64 \n",
|
|
"697 A93 A101 4 A124 24 \n",
|
|
"698 A93 A101 2 A123 32 \n",
|
|
"699 A93 A101 4 A123 40 \n",
|
|
"\n",
|
|
" Other_installment_plans Housing Number_of_existing_credits Job \\\n",
|
|
"0 A143 A152 1 A174 \n",
|
|
"1 A143 A152 2 A173 \n",
|
|
"2 A143 A152 1 A173 \n",
|
|
"3 A143 A152 2 A173 \n",
|
|
"4 A143 A151 1 A172 \n",
|
|
".. ... ... ... ... \n",
|
|
"695 A143 A151 2 A174 \n",
|
|
"696 A143 A152 1 A173 \n",
|
|
"697 A143 A153 2 A173 \n",
|
|
"698 A143 A152 2 A173 \n",
|
|
"699 A143 A151 1 A174 \n",
|
|
"\n",
|
|
" N_people_being_liable_provide_maintenance Telephone Foreign_worker \n",
|
|
"0 1 1 1 \n",
|
|
"1 1 1 1 \n",
|
|
"2 1 0 1 \n",
|
|
"3 1 0 1 \n",
|
|
"4 1 1 1 \n",
|
|
".. ... ... ... \n",
|
|
"695 2 1 1 \n",
|
|
"696 1 0 1 \n",
|
|
"697 2 0 1 \n",
|
|
"698 1 1 1 \n",
|
|
"699 1 1 1 \n",
|
|
"\n",
|
|
"[700 rows x 20 columns]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import os\n",
|
|
"import sys\n",
|
|
"sys.path.insert(0, os.path.abspath('..'))\n",
|
|
"\n",
|
|
"from apt.utils.dataset_utils import get_german_credit_dataset_pd\n",
|
|
"\n",
|
|
"(x_train, y_train), (x_test, y_test) = get_german_credit_dataset_pd()\n",
|
|
"features = [\"Existing_checking_account\", \"Duration_in_month\", \"Credit_history\", \"Purpose\", \"Credit_amount\",\n",
|
|
" \"Savings_account\", \"Present_employment_since\", \"Installment_rate\", \"Personal_status_sex\", \"debtors\",\n",
|
|
" \"Present_residence\", \"Property\", \"Age\", \"Other_installment_plans\", \"Housing\",\n",
|
|
" \"Number_of_existing_credits\", \"Job\", \"N_people_being_liable_provide_maintenance\", \"Telephone\",\n",
|
|
" \"Foreign_worker\"]\n",
|
|
"categorical_features = [\"Existing_checking_account\", \"Credit_history\", \"Purpose\", \"Savings_account\",\n",
|
|
" \"Present_employment_since\", \"Personal_status_sex\", \"debtors\", \"Property\",\n",
|
|
" \"Other_installment_plans\", \"Housing\", \"Job\"]\n",
|
|
"QI = [\"Duration_in_month\", \"Credit_history\", \"Purpose\", \"debtors\", \"Property\", \"Other_installment_plans\",\n",
|
|
" \"Housing\", \"Job\"]\n",
|
|
"\n",
|
|
"print(x_train)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Train decision tree model\n",
|
|
"We use OneHotEncoder to handle categorical features."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Base model accuracy: 0.6933333333333334\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn.compose import ColumnTransformer\n",
|
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|
"from sklearn.impute import SimpleImputer\n",
|
|
"from sklearn.pipeline import Pipeline\n",
|
|
"from sklearn.tree import DecisionTreeClassifier\n",
|
|
"numeric_features = [f for f in features if f not in categorical_features]\n",
|
|
"numeric_transformer = Pipeline(\n",
|
|
" steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]\n",
|
|
")\n",
|
|
"categorical_transformer = OneHotEncoder(handle_unknown=\"ignore\", sparse=False)\n",
|
|
"preprocessor = ColumnTransformer(\n",
|
|
" transformers=[\n",
|
|
" (\"num\", numeric_transformer, numeric_features),\n",
|
|
" (\"cat\", categorical_transformer, categorical_features),\n",
|
|
" ]\n",
|
|
")\n",
|
|
"encoded_train = preprocessor.fit_transform(x_train)\n",
|
|
"model = DecisionTreeClassifier()\n",
|
|
"model.fit(encoded_train, y_train)\n",
|
|
"\n",
|
|
"encoded_test = preprocessor.transform(x_test)\n",
|
|
"print('Base model accuracy: ', model.score(encoded_test, y_test))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Run minimization\n",
|
|
"We will run minimization on data that includes categorical features, minimizing only a subset of the features, and try different values of target accuracy (how close to the original model's accuracy we want to stay, with 1 meaning the same accuracy as on the original data)."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.805556\n",
|
|
"Improving accuracy\n",
|
|
"feature to remove: Credit_history\n",
|
|
"Removed feature: Credit_history, new relative accuracy: 0.819444\n",
|
|
"feature to remove: Other_installment_plans\n",
|
|
"Removed feature: Other_installment_plans, new relative accuracy: 0.847222\n",
|
|
"feature to remove: Duration_in_month\n",
|
|
"Removed feature: Duration_in_month, new relative accuracy: 0.847222\n",
|
|
"feature to remove: Property\n",
|
|
"Removed feature: Property, new relative accuracy: 0.847222\n",
|
|
"feature to remove: Housing\n",
|
|
"Removed feature: Housing, new relative accuracy: 0.847222\n",
|
|
"feature to remove: Purpose\n",
|
|
"Removed feature: Purpose, new relative accuracy: 0.986111\n",
|
|
"feature to remove: debtors\n",
|
|
"Removed feature: debtors, new relative accuracy: 0.986111\n",
|
|
"feature to remove: Job\n",
|
|
"Removed feature: Job, new relative accuracy: 1.000000\n",
|
|
"Accuracy on minimized data: 0.6666666666666666\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import sys\n",
|
|
"import os\n",
|
|
"sys.path.insert(0, os.path.abspath('..'))\n",
|
|
"\n",
|
|
"from apt.minimization import GeneralizeToRepresentative\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"\n",
|
|
"# default target_accuracy is 0.998\n",
|
|
"minimizer = GeneralizeToRepresentative(model, \n",
|
|
" categorical_features=categorical_features, features_to_minimize=QI)\n",
|
|
"\n",
|
|
"# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
|
|
"# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n",
|
|
"# data it could result in a larger gap)\n",
|
|
"# Don't forget to leave a hold-out set for final validation!\n",
|
|
"X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(x_test, y_test, stratify=y_test,\n",
|
|
" test_size = 0.4, random_state = 38)\n",
|
|
"X_generalizer_train.reset_index(drop=True, inplace=True)\n",
|
|
"y_generalizer_train.reset_index(drop=True, inplace=True)\n",
|
|
"x_test.reset_index(drop=True, inplace=True)\n",
|
|
"y_test.reset_index(drop=True, inplace=True)\n",
|
|
"encoded_generalizer_train = preprocessor.transform(X_generalizer_train)\n",
|
|
"x_train_predictions = model.predict(encoded_generalizer_train)\n",
|
|
"minimizer.fit(X_generalizer_train, x_train_predictions, features_names=features)\n",
|
|
"transformed = minimizer.transform(x_test, features_names=features)\n",
|
|
"\n",
|
|
"encoded_transformed = preprocessor.transform(transformed)\n",
|
|
"print('Accuracy on minimized data: ', model.score(encoded_transformed, y_test))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Let's see what features were generalized"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"{'ranges': {}, 'categories': {}, 'untouched': ['Foreign_worker', 'Other_installment_plans', 'Existing_checking_account', 'Purpose', 'debtors', 'Housing', 'N_people_being_liable_provide_maintenance', 'Present_employment_since', 'Installment_rate', 'Credit_history', 'Property', 'Present_residence', 'Age', 'Credit_amount', 'Duration_in_month', 'Job', 'Personal_status_sex', 'Number_of_existing_credits', 'Savings_account', 'Telephone']}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"generalizations = minimizer.generalizations\n",
|
|
"print(generalizations)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"We can see that for the default target accuracy of 0.998 of the original accuracy, no generalizations are possible (all features are left untouched, i.e., not generalized).\n",
|
|
"\n",
|
|
"Let's change to a slightly lower target accuracy."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.805556\n",
|
|
"Improving accuracy\n",
|
|
"feature to remove: Credit_history\n",
|
|
"Removed feature: Credit_history, new relative accuracy: 0.819444\n",
|
|
"feature to remove: Other_installment_plans\n",
|
|
"Removed feature: Other_installment_plans, new relative accuracy: 0.847222\n",
|
|
"feature to remove: Duration_in_month\n",
|
|
"Removed feature: Duration_in_month, new relative accuracy: 0.847222\n",
|
|
"feature to remove: Property\n",
|
|
"Removed feature: Property, new relative accuracy: 0.847222\n",
|
|
"feature to remove: Housing\n",
|
|
"Removed feature: Housing, new relative accuracy: 0.847222\n",
|
|
"feature to remove: Purpose\n",
|
|
"Removed feature: Purpose, new relative accuracy: 0.986111\n",
|
|
"Accuracy on minimized data: 0.6666666666666666\n",
|
|
"{'ranges': {}, 'categories': {'debtors': [['A103', 'A102'], ['A101']], 'Job': [['A173', 'A174'], ['A171'], ['A172']]}, 'untouched': ['Credit_amount', 'Duration_in_month', 'Credit_history', 'Foreign_worker', 'Housing', 'Other_installment_plans', 'Property', 'N_people_being_liable_provide_maintenance', 'Present_residence', 'Personal_status_sex', 'Telephone', 'Number_of_existing_credits', 'Present_employment_since', 'Existing_checking_account', 'Savings_account', 'Age', 'Purpose', 'Installment_rate']}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# We allow up to an 8% deviation from the original model's accuracy (target_accuracy=0.92)\n",
|
|
"minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.92, \n",
|
|
" categorical_features=categorical_features, features_to_minimize=QI)\n",
|
|
"\n",
|
|
"minimizer2.fit(X_generalizer_train, x_train_predictions, features_names=features)\n",
|
|
"transformed2 = minimizer2.transform(x_test, features_names=features)\n",
|
|
"\n",
|
|
"encoded_transformed2 = preprocessor.transform(transformed2)\n",
|
|
"print('Accuracy on minimized data: ', model.score(encoded_transformed2, y_test))\n",
|
|
"generalizations2 = minimizer2.generalizations\n",
|
|
"print(generalizations2)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"This time we were able to generalize two features (debtors and Job)."
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|