diff --git a/README.md b/README.md index 2c38ebb..3058188 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,28 @@ # ai-privacy-toolkit +

+ +

+
+ A toolkit for tools and techniques related to the privacy and compliance of AI models. + +The first release of this toolkit contains a single module called [**anonymization**](apt/anonymization/README.md). +This module contains methods for anonymizing ML model training data, so that when +a model is retrained on the anonymized data, the model itself will also be considered +anonymous. This may help exempt the model from different obligations and restrictions +set out in data protection regulations such as GDPR, CCPA, etc. + +Official ai-privacy-toolkit documentation: + +**Related toolkits:** + +[ai-minimization-toolkit](https://github.com/IBM/ai-minimization-toolkit): A toolkit for +reducing the amount of personal data needed to perform predictions with a machine learning model + +[differential-privacy-library](https://github.com/IBM/differential-privacy-library): A +general-purpose library for experimenting with, investigating and developing applications in, +differential privacy. + +[adversarial-robustness-toolbox](https://github.com/Trusted-AI/adversarial-robustness-toolbox): +A Python library for Machine Learning Security. + diff --git a/apt/__init__.py b/apt/__init__.py new file mode 100644 index 0000000..b96f84a --- /dev/null +++ b/apt/__init__.py @@ -0,0 +1,4 @@ +from apt import anonymization +from apt import utils + +__version__ = "0.0.1" \ No newline at end of file diff --git a/apt/anonymization/README.md b/apt/anonymization/README.md new file mode 100644 index 0000000..3ca161c --- /dev/null +++ b/apt/anonymization/README.md @@ -0,0 +1,22 @@ +# anonymization module +This module contains methods for anonymizing ML model training data, so that when +a model is retrained on the anonymized data, the model itself will also be considered +anonymous. This may help exempt the model from different obligations and restrictions +set out in data protection regulations such as GDPR, CCPA, etc. 
+ +The module contains methods that enable anonymizing training datasets in a manner that +is tailored to and guided by an existing, trained ML model. It uses the existing model's +predictions on the training data to train a second, anonymizer model, that eventually determines +the generalizations that will be applied to the training data. For more information about the +method see: https://arxiv.org/abs/2007.13086 + +Once the anonymized training data is returned, it can be used to retrain the model. + +The following figure depicts the overall process: + +

+ +

+
class Anonymize:
    """
    Performs tailored, model-guided anonymization of training datasets for ML models.

    A decision tree is trained on the original model's predictions; its leaves become
    anonymization "cells" (each holding at least ``k`` samples), and every record in a
    cell receives the same representative values for the quasi-identifier features.

    Based on the implementation described in: https://arxiv.org/abs/2007.13086
    """

    def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list],
                 categorical_features: Optional[list] = None):
        """
        :param k: The privacy parameter that determines the number of records that will be
                  indistinguishable from each other (when looking at the quasi identifiers).
                  Should be at least 2.
        :param quasi_identifiers: The indexes of the features that need to be anonymized (these
                                  should be the features that may directly, indirectly or in
                                  combination with additional data, identify an individual).
        :param categorical_features: The list of categorical features (should only be supplied
                                     when passing data as a pandas dataframe).
        :raises ValueError: If ``k`` < 2 or ``quasi_identifiers`` is empty.
        """
        if k < 2:
            raise ValueError("k should be a positive integer with a value of 2 or higher")
        # Explicit None/len check: `not quasi_identifiers` raises an "ambiguous truth value"
        # ValueError when a numpy array is supplied, even though the type hint allows one.
        if quasi_identifiers is None or len(quasi_identifiers) < 1:
            raise ValueError("The list of quasi-identifiers cannot be empty")

        self.k = k
        self.quasi_identifiers = quasi_identifiers
        self.categorical_features = categorical_features

    def anonymize(self, x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame]) \
            -> Union[np.ndarray, pd.DataFrame]:
        """
        Perform model-guided anonymization.

        :param x: The training data for the model. If provided as a pandas dataframe, may contain
                  both numeric and categorical data.
        :param y: The predictions of the original model on the training data.
        :return: The anonymized training dataset, of the same type as ``x``.
        """
        if isinstance(x, np.ndarray):
            return self._anonymize_ndarray(x.copy(), y)
        # pandas input: we must know which columns to one-hot encode
        if not self.categorical_features:
            raise ValueError('When supplying a pandas dataframe, categorical_features must be defined')
        return self._anonymize_pandas(x.copy(), y)

    def _anonymize_ndarray(self, x, y):
        """Anonymize a numpy array in place (on the copy made by ``anonymize``)."""
        if x.shape[0] != y.shape[0]:
            raise ValueError("x and y should have same number of rows")
        x_anonymizer_train = x[:, self.quasi_identifiers]
        # min_samples_leaf=k guarantees every leaf (anonymization cell) holds >= k records.
        self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2,
                                                 min_samples_leaf=self.k)
        self.anonymizer.fit(x_anonymizer_train, y)
        cells_by_id = self._calculate_cells(x, x_anonymizer_train)
        return self._anonymize_data_numpy(x, x_anonymizer_train, cells_by_id)

    def _anonymize_pandas(self, x, y):
        """Anonymize a pandas dataframe (on the copy made by ``anonymize``)."""
        if x.shape[0] != y.shape[0]:
            raise ValueError("x and y should have same number of rows")
        x_anonymizer_train = x.loc[:, self.quasi_identifiers]
        # One-hot encode categorical QI columns before training the decision tree.
        x_prepared = self._modify_categorical_features(x_anonymizer_train)
        self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2,
                                                 min_samples_leaf=self.k)
        self.anonymizer.fit(x_prepared, y)
        cells_by_id = self._calculate_cells(x, x_prepared)
        return self._anonymize_data_pandas(x, x_prepared, cells_by_id)

    def _calculate_cells(self, x, x_anonymizer_train):
        """Build one anonymization cell per decision-tree leaf and compute its representatives."""
        # x is the original data; x_anonymizer_train holds only the (one-hot encoded) QIs.
        cells_by_id = {}
        leaves = []
        for node, feature in enumerate(self.anonymizer.tree_.feature):
            if feature == -2:  # sklearn marks leaf nodes with TREE_UNDEFINED (-2)
                leaves.append(node)
                label_hist = self.anonymizer.tree_.value[node][0]
                hist = [int(i) for i in label_hist]
                label = int(self.anonymizer.classes_[np.argmax(label_hist)])
                cell = {'label': label, 'hist': hist, 'id': int(node)}
                cells_by_id[cell['id']] = cell
        self.nodes = leaves
        self._find_representatives(x, x_anonymizer_train, cells_by_id.values())
        return cells_by_id

    def _find_representatives(self, x, x_anonymizer_train, cells):
        """Choose, per cell and per QI feature, the value all records in the cell will receive."""
        # x is the original data; x_anonymizer_train holds only the (one-hot encoded) QIs.
        node_ids = self._find_sample_nodes(x_anonymizer_train)
        for cell in cells:
            cell['representative'] = {}
            # All rows that fall into this leaf/cell.
            indexes = [index for index, node_id in enumerate(node_ids) if node_id == cell['id']]
            # TODO: should we filter only those with majority label? (using hist)
            if isinstance(x, np.ndarray):
                rows = x[indexes]
            else:  # pandas
                rows = x.iloc[indexes]
            for feature in self.quasi_identifiers:
                if isinstance(x, np.ndarray):
                    values = rows[:, feature]
                else:  # pandas
                    values = rows.loc[:, feature]
                if self.categorical_features and feature in self.categorical_features:
                    # Categorical feature: use the most common value in the cell.
                    cell['representative'][feature] = Counter(values).most_common(1)[0][0]
                else:
                    # Numeric feature: use the actual value in the cell closest to the
                    # cell's median (the median itself may not be present in the data).
                    median = np.median(values)
                    min_value = max(values)
                    min_dist = float("inf")
                    for value in values:
                        # euclidean distance of two scalars is simply |value - median|
                        dist = distance.euclidean(value, median)
                        if dist < min_dist:
                            min_dist = dist
                            min_value = value
                    cell['representative'][feature] = min_value

    def _find_sample_nodes(self, samples):
        """Return, for each sample, the id of the leaf node it falls into."""
        paths = self.anonymizer.decision_path(samples).toarray()
        node_set = set(self.nodes)
        # Intersecting a sample's decision path with the leaf set leaves exactly one node.
        return [(list(set([i for i, v in enumerate(p) if v == 1]) & node_set))[0] for p in paths]

    def _find_sample_cells(self, samples, cells_by_id):
        """Map each sample to its anonymization cell."""
        node_ids = self._find_sample_nodes(samples)
        return [cells_by_id[node_id] for node_id in node_ids]

    def _anonymize_data_numpy(self, x, x_anonymizer_train, cells_by_id):
        """Overwrite each row's QI features with its cell's representative values."""
        cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
        for index, row in enumerate(x):
            cell = cells[index]
            for feature, value in cell['representative'].items():
                row[feature] = value
        return x

    def _anonymize_data_pandas(self, x, x_anonymizer_train, cells_by_id):
        """Overwrite each row's QI features with its cell's representative values."""
        cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
        for index, (i, row) in enumerate(x.iterrows()):
            cell = cells[index]
            for feature, value in cell['representative'].items():
                x.at[i, feature] = value
        return x

    def _modify_categorical_features(self, x):
        """One-hot encode the categorical quasi-identifier columns (pandas input only)."""
        self.categorical_values = {}
        self.one_hot_to_features = {}
        features_to_remove = []
        for feature in self.categorical_features:
            if feature in self.quasi_identifiers:
                all_values = x.loc[:, feature]
                values = list(all_values.unique())
                self.categorical_values[feature] = values
                x[feature] = pd.Categorical(x.loc[:, feature], categories=values, ordered=False)
                one_hot_vector = pd.get_dummies(x[feature], prefix=feature)
                # Remember which original feature each one-hot column came from.
                for one_hot_vector_feature in one_hot_vector.columns:
                    self.one_hot_to_features[one_hot_vector_feature] = feature
                x = pd.concat([x, one_hot_vector], axis=1)
                features_to_remove.append(feature)
        return x.drop(features_to_remove, axis=1)
def _load_iris(test_set_size: float = 0.3):
    """
    Load scikit-learn's Iris dataset and split it into train and test sets.

    :param test_set_size: Proportion of the data used as the test split (between 0 and 1).
    :return: Tuple ((x_train, y_train), (x_test, y_test)) as numpy arrays.
    """
    iris = datasets.load_iris()
    data = iris.data
    labels = iris.target

    # Stratify on the labels so class proportions are preserved in both splits;
    # the fixed random_state keeps the split reproducible across calls.
    x_train, x_test, y_train, y_test = model_selection.train_test_split(data, labels,
                                                                        test_size=test_set_size,
                                                                        random_state=18,
                                                                        stratify=labels,
                                                                        shuffle=True)

    return (x_train, y_train), (x_test, y_test)


def get_iris_dataset(test_set_size: float = 0.3):
    """
    Load the Iris dataset from scikit-learn.

    :param test_set_size: Proportion of the data to use as the test split (value between 0 and 1).
                          Defaults to 0.3, preserving the previous behavior.
    :return: Tuple ((x_train, y_train), (x_test, y_test)) as numpy arrays.
    """
    # Pass the split size through instead of silently ignoring it: the old docstring
    # advertised a `test_set` parameter that did not actually exist.
    return _load_iris(test_set_size)
+ """ + features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', + 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', + 'label'] + train_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' + test_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test' + train_file = '../datasets/adult/train' + test_file = '../datasets/adult/test' + + ssl._create_default_https_context = ssl._create_unverified_context + if not path.exists(train_file): + urlretrieve(train_url, train_file) + if not path.exists(test_file): + urlretrieve(test_url, test_file) + + train = pd.read_csv(train_file, sep=', ', names=features, engine='python') + test = pd.read_csv(test_file, sep=', ', names=features, engine='python') + test = test.iloc[1:] + + train = _modify_adult_dataset(train) + test = _modify_adult_dataset(test) + + x_train = train.drop(['label'], axis=1) + y_train = train.loc[:, 'label'] + x_test = test.drop(['label'], axis=1) + y_test = test.loc[:, 'label'] + + return (x_train, y_train), (x_test, y_test) + + +def _modify_adult_dataset(data): + def modify_label(value): + if value == '<=50K.' or value == '<=50K': + return 0 + elif value == '>50K.' 
or value == '>50K': + return 1 + else: + raise Exception('Bad label value') + + def modify_native_country(value): + Euro_1 = ['Italy', 'Holand-Netherlands', 'Germany', 'France'] + Euro_2 = ['Yugoslavia', 'South', 'Portugal', 'Poland', 'Hungary', 'Greece'] + SE_Asia = ['Vietnam', 'Thailand', 'Philippines', 'Laos', 'Cambodia'] + UnitedStates = ['United-States'] + LatinAmerica = ['Trinadad&Tobago', 'Puerto-Rico', 'Outlying-US(Guam-USVI-etc)', 'Nicaragua', 'Mexico', + 'Jamaica', 'Honduras', 'Haiti', 'Guatemala', 'Dominican-Republic'] + China = ['Taiwan', 'Hong', 'China'] + BritishCommonwealth = ['Scotland', 'Ireland', 'India', 'England', 'Canada'] + SouthAmerica = ['Peru', 'El-Salvador', 'Ecuador', 'Columbia'] + Other = ['Japan', 'Iran', 'Cuba'] + + if value in Euro_1: + return 'Euro_1' + elif value in Euro_2: + return 'Euro_2' + elif value in SE_Asia: + return 'SE_Asia' + elif value in UnitedStates: + return 'UnitedStates' + elif value in LatinAmerica: + return 'LatinAmerica' + elif value in China: + return 'China' + elif value in BritishCommonwealth: + return 'BritishCommonwealth' + elif value in SouthAmerica: + return 'SouthAmerica' + elif value in Other: + return 'Other' + elif value == '?': + return 'Unknown' + else: + raise Exception('Bad native country value') + + data['label'] = data['label'].apply(modify_label) + data['native-country'] = data['native-country'].apply(modify_native_country) + + for col in ('age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'): + try: + data[col] = data[col].fillna(0) + except KeyError: + print('missing column ' + col) + + for col in ('workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'): + try: + data[col] = data[col].fillna('NA') + except KeyError: + print('missing column ' + col) + + return data.drop(['fnlwgt', 'education'], axis=1) + + +def get_nursery_dataset(raw: bool = True, test_set: float = 0.2, transform_social: bool = False): + """ + Loads the UCI Nursery 
def get_nursery_dataset(raw: bool = True, test_set: float = 0.2, transform_social: bool = False):
    """
    Load the UCI Nursery dataset, downloading it into ``../datasets/nursery`` (relative to
    the caller's working directory — TODO confirm against callers) if it is not already
    cached there.

    :param raw: `True` if no preprocessing should be applied to the data. Otherwise, categorical
                data is one-hot encoded and data is scaled using sklearn's StandardScaler.
    :param test_set: Proportion of the data to use as validation split. The value should be
                     between 0 and 1.
    :param transform_social: If `True`, transforms the social feature to be binary for the purpose
                             of attribute inference. This is done by assigning the original value
                             'problematic' the new value 1, and the other original values are
                             assigned the new value 0.
    :return: Dataset and labels as pandas dataframes.
    """
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data'
    data_file = '../datasets/nursery/data'

    # SECURITY NOTE: this disables TLS certificate verification process-wide for the
    # download. Kept for parity with the original behavior, but consider removing it.
    ssl._create_default_https_context = ssl._create_unverified_context
    if not path.exists(data_file):
        urlretrieve(url, data_file)

    # Load the data.
    features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health", "label"]
    categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health"]
    data = pd.read_csv(data_file, sep=",", names=features, engine="python")
    # Remove rows with a missing label; the 'recommend' class is too sparse to keep.
    data = data.dropna(subset=["label"])
    data.drop(data.loc[data["label"] == "recommend"].index, axis=0, inplace=True)

    # Fill missing values.
    data["children"] = data["children"].fillna(0)
    for col in ["parents", "has_nurs", "form", "housing", "finance", "social", "health"]:
        data[col] = data[col].fillna("other")

    def modify_label(value):
        # 4 classes remain after the sparse 'recommend' label was dropped above.
        if value == "not_recom":
            return 0
        elif value == "very_recom":
            return 1
        elif value == "priority":
            return 2
        elif value == "spec_prior":
            return 3
        else:
            # ValueError (a subclass of Exception) instead of a bare Exception.
            raise ValueError("Bad label value: %s" % value)

    data["label"] = data["label"].apply(modify_label)
    # 'children' is numeric except for the literal string "more"; map that to 4.
    data["children"] = data["children"].apply(lambda v: 4 if v == "more" else v)

    if transform_social:
        # Binarize 'social' (1 for 'problematic', else 0) and stop treating it as categorical.
        data["social"] = data["social"].apply(lambda v: 1 if v == "problematic" else 0)
        categorical_features.remove("social")

    if not raw:
        # One-hot encode the categorical features.
        features_to_remove = []
        for feature in categorical_features:
            values = list(data.loc[:, feature].unique())
            data[feature] = pd.Categorical(data.loc[:, feature], categories=values, ordered=False)
            one_hot_vector = pd.get_dummies(data[feature], prefix=feature)
            data = pd.concat([data, one_hot_vector], axis=1)
            features_to_remove.append(feature)
        data = data.drop(features_to_remove, axis=1)

        # Scale the (now fully numeric) features; keep the label column unscaled.
        label = data.loc[:, "label"]
        feature_frame = data.drop(["label"], axis=1)
        scaler = sklearn.preprocessing.StandardScaler()
        scaler.fit(feature_frame)
        scaled_features = pd.DataFrame(scaler.transform(feature_frame), columns=feature_frame.columns)
        data = pd.concat([label, scaled_features], axis=1, join="inner")

    # Stratified train/test split. Loop variables renamed so they no longer shadow the
    # `test_set` parameter (n_splits=1, so the loop body runs exactly once).
    stratified = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1, test_size=test_set,
                                                                random_state=18)
    for train_idx, test_idx in stratified.split(data, data["label"]):
        train = data.iloc[train_idx]
        test = data.iloc[test_idx]
        x_train = train.drop(["label"], axis=1)
        y_train = train.loc[:, "label"]
        x_test = test.drop(["label"], axis=1)
        y_test = test.loc[:, "label"]

    return (x_train, y_train), (x_test, y_test)
+help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/images/AI_Privacy_project2.jpg b/docs/images/AI_Privacy_project2.jpg new file mode 100644 index 0000000..9f6dbb8 Binary files /dev/null and b/docs/images/AI_Privacy_project2.jpg differ diff --git a/docs/images/logo with text.jpg b/docs/images/logo with text.jpg new file mode 100644 index 0000000..797f75a Binary files /dev/null and b/docs/images/logo with text.jpg differ diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..9534b01 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/apt.anonymization.rst b/docs/source/apt.anonymization.rst new file mode 100644 index 0000000..6453554 --- /dev/null +++ b/docs/source/apt.anonymization.rst @@ -0,0 +1,22 @@ +apt.anonymization package +========================= + +Submodules +---------- + +apt.anonymization.anonymizer module +----------------------------------- + +.. automodule:: apt.anonymization.anonymizer + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: apt.anonymization + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apt.rst b/docs/source/apt.rst new file mode 100644 index 0000000..21abe0a --- /dev/null +++ b/docs/source/apt.rst @@ -0,0 +1,18 @@ +apt package +=========== + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + apt.anonymization + +Module contents +--------------- + +.. automodule:: apt + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..1c8673a --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,57 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'ai-privacy-toolkit' +copyright = '2021, Abigail Goldsteen' +author = 'Abigail Goldsteen' + +# The full version, including alpha/beta/rc tags +release = '0.0.1' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon' +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..0e0b75e --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,37 @@ +.. ai-privacy-toolkit documentation master file, created by + sphinx-quickstart on Mon Feb 15 12:42:20 2021. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to ai-privacy-toolkit's documentation! 
+============================================== + +This project provides tools for assessing and improving the privacy and compliance of AI models. + +The first release of this toolkit contains a single module called anonymization. This +module contains methods for anonymizing ML model training data, so that when +a model is retrained on the anonymized data, the model itself will also be considered +anonymous. This may help exempt the model from different obligations and restrictions +set out in data protection regulations such as GDPR, CCPA, etc. + +.. toctree:: + :maxdepth: 2 + :caption: Getting Started: + + quick_start + +.. toctree:: + :maxdepth: 2 + :hidden: + :caption: API + + apt + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/source/modules.rst b/docs/source/modules.rst new file mode 100644 index 0000000..9b7541d --- /dev/null +++ b/docs/source/modules.rst @@ -0,0 +1,8 @@ +ai-privacy-toolkit +================== + +.. toctree:: + :maxdepth: 4 + + apt + tests diff --git a/docs/source/quick_start.rst b/docs/source/quick_start.rst new file mode 100644 index 0000000..f59d970 --- /dev/null +++ b/docs/source/quick_start.rst @@ -0,0 +1,16 @@ +########################################### +Getting started with the AI Privacy toolkit +########################################### + + +Download the toolkit code: +========================== + +Clone the ``ai-privacy-toolkit`` repository:: + + $ git clone https://github.com/IBM/ai-privacy-toolkit.git + +Or download using pip:: + + pip install ai-privacy-toolkit==0.0.1 + diff --git a/docs/source/tests.rst b/docs/source/tests.rst new file mode 100644 index 0000000..4efe9ed --- /dev/null +++ b/docs/source/tests.rst @@ -0,0 +1,30 @@ +tests package +============= + +Submodules +---------- + +tests.test\_anonymizer module +----------------------------- + +.. 
automodule:: tests.test_anonymizer + :members: + :undoc-members: + :show-inheritance: + +tests.utils module +------------------ + +.. automodule:: tests.utils + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: tests + :members: + :undoc-members: + :show-inheritance: diff --git a/notebooks/attribute_inference_anonymization_nursery.ipynb b/notebooks/attribute_inference_anonymization_nursery.ipynb new file mode 100644 index 0000000..9952885 --- /dev/null +++ b/notebooks/attribute_inference_anonymization_nursery.ipynb @@ -0,0 +1,1165 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using ML anonymization to defend against attribute inference attacks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial we will show how to anonymize models using the ML anonymization module. \n", + "\n", + "We will demonstrate running inference attacks both on a vanilla model, and then on different anonymized versions of the model. We will run both black-box and white-box attribute inference attacks using ART's inference module (https://github.com/Trusted-AI/adversarial-robustness-toolbox/tree/main/art/attacks/inference). \n", + "\n", + "This will be demonstarted using the Nursery dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/datasets/nursery). \n", + "\n", + "The sensitive feature we are trying to infer is the 'social' feature, after turning it into a binary feature (the original value 'problematic' receives the new value 1 and the rest 0). We also preprocess the data such that all categorical features are one-hot encoded." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load data" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
parentshas_nursformchildrenhousingfinancesocialhealth
8450pretentiousvery_critfoster1less_convconvenient1not_recom
12147great_pretvery_critcomplete1criticalinconv1recommended
2780usualcriticalcomplete4less_convconvenient1not_recom
11924great_pretcriticalfoster1criticalconvenient1not_recom
59usualpropercomplete2convenientconvenient0not_recom
...........................
5193pretentiousless_propercomplete1convenientinconv0recommended
1375usualless_properincomplete2less_convconvenient1priority
10318great_pretless_properfoster4convenientconvenient0priority
6396pretentiousimpropercompleted3less_convconvenient1recommended
485usualproperincomplete1criticalinconv1not_recom
\n", + "

10366 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " parents has_nurs form children housing finance \\\n", + "8450 pretentious very_crit foster 1 less_conv convenient \n", + "12147 great_pret very_crit complete 1 critical inconv \n", + "2780 usual critical complete 4 less_conv convenient \n", + "11924 great_pret critical foster 1 critical convenient \n", + "59 usual proper complete 2 convenient convenient \n", + "... ... ... ... ... ... ... \n", + "5193 pretentious less_proper complete 1 convenient inconv \n", + "1375 usual less_proper incomplete 2 less_conv convenient \n", + "10318 great_pret less_proper foster 4 convenient convenient \n", + "6396 pretentious improper completed 3 less_conv convenient \n", + "485 usual proper incomplete 1 critical inconv \n", + "\n", + " social health \n", + "8450 1 not_recom \n", + "12147 1 recommended \n", + "2780 1 not_recom \n", + "11924 1 not_recom \n", + "59 0 not_recom \n", + "... ... ... \n", + "5193 0 recommended \n", + "1375 1 priority \n", + "10318 0 priority \n", + "6396 1 recommended \n", + "485 1 not_recom \n", + "\n", + "[10366 rows x 8 columns]" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "import sys\n", + "sys.path.insert(0, os.path.abspath('..'))\n", + "\n", + "from apt.utils import get_nursery_dataset\n", + "\n", + "(x_train, y_train), (x_test, y_test) = get_nursery_dataset(transform_social=True)\n", + "\n", + "x_train" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train decision tree model" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Base model accuracy: 0.9969135802469136\n" + ] + } + ], + "source": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "from art.estimators.classification.scikitlearn import ScikitlearnDecisionTreeClassifier\n", + "from sklearn.preprocessing import OneHotEncoder\n", + 
"\n", + "x_train_str = x_train.astype(str)\n", + "train_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(x_train_str)\n", + "x_test_str = x_test.astype(str)\n", + "test_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(x_test_str)\n", + " \n", + "model = DecisionTreeClassifier()\n", + "model.fit(train_encoded, y_train)\n", + "\n", + "art_classifier = ScikitlearnDecisionTreeClassifier(model)\n", + "\n", + "print('Base model accuracy: ', model.score(test_encoded, y_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Attack\n", + "### Black-box attack\n", + "The black-box attack basically trains an additional classifier (called the attack model) to predict the attacked feature's value from the remaining n-1 features as well as the original (attacked) model's predictions.\n", + "#### Train attack model" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from art.attacks.inference.attribute_inference import AttributeInferenceBlackBox\n", + "\n", + "attack_feature = 20\n", + "\n", + "# training data without attacked feature\n", + "x_train_for_attack = np.delete(train_encoded, attack_feature, 1)\n", + "# only attacked feature\n", + "x_train_feature = train_encoded[:, attack_feature].copy().reshape(-1, 1)\n", + "\n", + "bb_attack = AttributeInferenceBlackBox(art_classifier, attack_feature=attack_feature)\n", + "\n", + "# get original model's predictions\n", + "x_train_predictions = np.array([np.argmax(arr) for arr in art_classifier.predict(train_encoded)]).reshape(-1,1)\n", + "\n", + "# use half of training set for training the attack\n", + "attack_train_ratio = 0.5\n", + "attack_train_size = int(len(train_encoded) * attack_train_ratio)\n", + "\n", + "# train attack model\n", + "bb_attack.fit(train_encoded[:attack_train_size])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Infer 
sensitive feature and check accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6430638626278217\n" + ] + } + ], + "source": [ + "# get inferred values\n", + "values=[0, 1]\n", + "\n", + "inferred_train_bb = bb_attack.infer(x_train_for_attack[attack_train_size:], x_train_predictions[attack_train_size:], values=values)\n", + "# check accuracy\n", + "train_acc = np.sum(inferred_train_bb == np.around(x_train_feature[attack_train_size:], decimals=8).reshape(1,-1)) / len(inferred_train_bb)\n", + "print(train_acc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This means that for 64% of the training set, the attacked feature is inferred correctly using this attack." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Whitebox attack\n", + "This attack does not train any additional model, it simply uses additional information coded within the attacked decision tree model to compute the probability of each value of the attacked feature and outputs the value with the highest probability." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6980513216284006\n" + ] + } + ], + "source": [ + "from art.attacks.inference.attribute_inference import AttributeInferenceWhiteBoxDecisionTree\n", + "\n", + "priors = [6925 / 10366, 3441 / 10366]\n", + "\n", + "wb2_attack = AttributeInferenceWhiteBoxDecisionTree(art_classifier, attack_feature=attack_feature)\n", + "\n", + "# get inferred values\n", + "inferred_train_wb2 = wb2_attack.infer(x_train_for_attack, x_train_predictions, values=values, priors=priors)\n", + "\n", + "# check accuracy\n", + "train_acc = np.sum(inferred_train_wb2 == np.around(x_train_feature, decimals=8).reshape(1,-1)) / len(inferred_train_wb2)\n", + "print(train_acc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The white-box attack is able to correctly infer the attacked feature value in 69% of the training set. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Anonymized data\n", + "## k=100\n", + "\n", + "Now we will apply the same attacks on an anonymized version of the same dataset (k=100). The data is anonymized on the quasi-identifiers: finance, social, health.\n", + "\n", + "k=100 means that each record in the anonymized dataset is identical to 99 others on the quasi-identifier values (i.e., when looking only at those 3 feature, the records are indistinguishable)." + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
parentshas_nursformchildrenhousingfinancesocialhealth
8450pretentiousvery_critfoster1less_convconvenient0not_recom
12147great_pretvery_critcomplete1criticalinconv1recommended
2780usualcriticalcomplete4less_convconvenient0not_recom
11924great_pretcriticalfoster1criticalconvenient0not_recom
59usualpropercomplete2convenientconvenient0not_recom
...........................
5193pretentiousless_propercomplete1convenientinconv0recommended
1375usualless_properincomplete2less_convconvenient1priority
10318great_pretless_properfoster4convenientconvenient0priority
6396pretentiousimpropercompleted3less_convconvenient1recommended
485usualproperincomplete1criticalconvenient0not_recom
\n", + "

10366 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " parents has_nurs form children housing finance \\\n", + "8450 pretentious very_crit foster 1 less_conv convenient \n", + "12147 great_pret very_crit complete 1 critical inconv \n", + "2780 usual critical complete 4 less_conv convenient \n", + "11924 great_pret critical foster 1 critical convenient \n", + "59 usual proper complete 2 convenient convenient \n", + "... ... ... ... ... ... ... \n", + "5193 pretentious less_proper complete 1 convenient inconv \n", + "1375 usual less_proper incomplete 2 less_conv convenient \n", + "10318 great_pret less_proper foster 4 convenient convenient \n", + "6396 pretentious improper completed 3 less_conv convenient \n", + "485 usual proper incomplete 1 critical convenient \n", + "\n", + " social health \n", + "8450 0 not_recom \n", + "12147 1 recommended \n", + "2780 0 not_recom \n", + "11924 0 not_recom \n", + "59 0 not_recom \n", + "... ... ... \n", + "5193 0 recommended \n", + "1375 1 priority \n", + "10318 0 priority \n", + "6396 1 recommended \n", + "485 0 not_recom \n", + "\n", + "[10366 rows x 8 columns]" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from apt.anonymization import Anonymize\n", + "\n", + "QI = [\"finance\", \"social\", \"health\"]\n", + "categorical_features = [\"parents\", \"has_nurs\", \"form\", \"housing\", \"finance\", \"health\", 'children']\n", + "anonymizer = Anonymize(100, QI, categorical_features=categorical_features)\n", + "anon = anonymizer.anonymize(x_train, x_train_predictions)\n", + "anon" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7585" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# number of distinct rows in original data\n", + "len(x_train.drop_duplicates())" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "5766" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# number of distinct rows in anonymized data\n", + "len(anon.drop_duplicates())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train decision tree model" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Anonymized model accuracy: 0.9976851851851852\n" + ] + } + ], + "source": [ + "anon_str = anon.astype(str)\n", + "anon_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(anon_str)\n", + "\n", + "anon_model = DecisionTreeClassifier()\n", + "anon_model.fit(anon_encoded, y_train)\n", + "\n", + "anon_art_classifier = ScikitlearnDecisionTreeClassifier(anon_model)\n", + "\n", + "print('Anonymized model accuracy: ', anon_model.score(test_encoded, y_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Attack\n", + "### Black-box attack" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6471155701331275\n" + ] + } + ], + "source": [ + "anon_bb_attack = AttributeInferenceBlackBox(anon_art_classifier, attack_feature=attack_feature)\n", + "\n", + "# get original model's predictions\n", + "anon_x_train_predictions = np.array([np.argmax(arr) for arr in anon_art_classifier.predict(train_encoded)]).reshape(-1,1)\n", + "\n", + "# train attack model\n", + "anon_bb_attack.fit(train_encoded[:attack_train_size])\n", + "\n", + "# get inferred values\n", + "inferred_train_anon_bb = anon_bb_attack.infer(x_train_for_attack[attack_train_size:], anon_x_train_predictions[attack_train_size:], values=values)\n", + "# check accuracy\n", + "train_acc = np.sum(inferred_train_anon_bb == np.around(x_train_feature[attack_train_size:], 
decimals=8).reshape(1,-1)) / len(inferred_train_anon_bb)\n", + "print(train_acc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### White box attack" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6982442600810341\n" + ] + } + ], + "source": [ + "anon_wb2_attack = AttributeInferenceWhiteBoxDecisionTree(anon_art_classifier, attack_feature=attack_feature)\n", + "\n", + "# get inferred values\n", + "inferred_train_anon_wb2 = anon_wb2_attack.infer(x_train_for_attack, anon_x_train_predictions, values=values, priors=priors)\n", + "\n", + "# check accuracy\n", + "anon_train_acc = np.sum(inferred_train_anon_wb2 == np.around(x_train_feature, decimals=8).reshape(1,-1)) / len(inferred_train_anon_wb2)\n", + "print(anon_train_acc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The accuracy of the attacks remains more or less the same. 
Let's check the precision and recall for each case:" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0.33056202194878614, 0.2888695146759663)\n", + "(0.34112301200908796, 0.3054344667247893)\n" + ] + } + ], + "source": [ + "def calc_precision_recall(predicted, actual, positive_value=1):\n", + " score = 0 # both predicted and actual are positive\n", + " num_positive_predicted = 0 # predicted positive\n", + " num_positive_actual = 0 # actual positive\n", + " for i in range(len(predicted)):\n", + " if predicted[i] == positive_value:\n", + " num_positive_predicted += 1\n", + " if actual[i] == positive_value:\n", + " num_positive_actual += 1\n", + " if predicted[i] == actual[i]:\n", + " if predicted[i] == positive_value:\n", + " score += 1\n", + " \n", + " if num_positive_predicted == 0:\n", + " precision = 1\n", + " else:\n", + " precision = score / num_positive_predicted # the fraction of predicted “Yes” responses that are correct\n", + " if num_positive_actual == 0:\n", + " recall = 1\n", + " else:\n", + " recall = score / num_positive_actual # the fraction of “Yes” responses that are predicted correctly\n", + "\n", + " return precision, recall\n", + " \n", + "# black-box regular\n", + "print(calc_precision_recall(inferred_train_bb, x_train_feature))\n", + "# black-box anonymized\n", + "print(calc_precision_recall(inferred_train_anon_bb, x_train_feature))" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0.6457357075913777, 0.2002324905550712)\n", + "(0.6472248353715898, 0.1999418773612322)\n" + ] + } + ], + "source": [ + "# white-box regular\n", + "print(calc_precision_recall(inferred_train_wb2, x_train_feature))\n", + "# white-box anonymized\n", + "print(calc_precision_recall(inferred_train_anon_wb2, x_train_feature))" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "Precision and recall remain almost the same, sometimes even slightly increasing.\n", + "\n", + "Now let's see what happens when we increase k to 1000." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## k=1000\n", + "\n", + "Now we apply the attacks on an anonymized version of the same dataset (k=1000). The data has been anonymized on the quasi-identifiers: finance, social, health." + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "anonymizer2 = Anonymize(1000, QI, categorical_features=categorical_features)\n", + "anon2 = anonymizer2.anonymize(x_train, x_train_predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4226" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# number of distinct rows in anonymized data\n", + "len(anon2.drop_duplicates())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train decision tree model" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Anonymized model accuracy: 0.9930555555555556\n" + ] + } + ], + "source": [ + "anon2_str = anon2.astype(str)\n", + "anon2_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(anon2_str)\n", + "\n", + "anon2_model = DecisionTreeClassifier()\n", + "anon2_model.fit(anon2_encoded, y_train)\n", + "\n", + "anon2_art_classifier = ScikitlearnDecisionTreeClassifier(anon2_model)\n", + "\n", + "print('Anonymized model accuracy: ', anon2_model.score(test_encoded, y_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Attack\n", + "### Black-box attack" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6266640941539648\n" + ] + } + ], + "source": [ + "anon2_bb_attack = AttributeInferenceBlackBox(anon2_art_classifier, attack_feature=attack_feature)\n", + "\n", + "# get original model's predictions\n", + "anon2_x_train_predictions = np.array([np.argmax(arr) for arr in anon2_art_classifier.predict(train_encoded)]).reshape(-1,1)\n", + "\n", + "# train attack model\n", + "anon2_bb_attack.fit(train_encoded[:attack_train_size])\n", + "\n", + "# get inferred values\n", + "inferred_train_anon2_bb = anon2_bb_attack.infer(x_train_for_attack[attack_train_size:], anon2_x_train_predictions[attack_train_size:], values=values)\n", + "# check accuracy\n", + "train_acc = np.sum(inferred_train_anon2_bb == np.around(x_train_feature[attack_train_size:], decimals=8).reshape(1,-1)) / len(inferred_train_anon2_bb)\n", + "print(train_acc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### White box attack" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6944819602546788\n" + ] + } + ], + "source": [ + "anon2_wb2_attack = AttributeInferenceWhiteBoxDecisionTree(anon2_art_classifier, attack_feature=attack_feature)\n", + "\n", + "# get inferred values\n", + "inferred_train_anon2_wb2 = anon2_wb2_attack.infer(x_train_for_attack, anon2_x_train_predictions, values=values, priors=priors)\n", + "\n", + "# check accuracy\n", + "train_acc = np.sum(inferred_train_anon2_wb2 == np.around(x_train_feature, decimals=8).reshape(1,-1)) / len(inferred_train_anon_wb2)\n", + "print(train_acc)" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0.35793357933579334, 0.17037470725995316)\n", + "(0.3360655737704918, 0.1680327868852459)\n", + "(0.6457357075913777, 
0.2002324905550712)\n", + "(0.6327519379844961, 0.1897704155768672)\n" + ] + } + ], + "source": [ + "# black-box regular\n", + "print(calc_precision_recall(inferred_train_bb, x_train_feature))\n", + "# black-box anonymized\n", + "print(calc_precision_recall(inferred_train_anon2_bb, x_train_feature))\n", + "\n", + "# white-box regular\n", + "print(calc_precision_recall(inferred_train_wb2, x_train_feature))\n", + "# white-box anonymized\n", + "print(calc_precision_recall(inferred_train_anon2_wb2, x_train_feature))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The accuracy of the black-box attack is slightly reduced, as well as the precision and recall in both attacks." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## k=100, all QI\n", + "Now let's see what happens if we define all 8 features in the Nursery dataset as quasi-identifiers." + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [], + "source": [ + "QI2 = [\"parents\", \"has_nurs\", \"form\", \"children\", \"housing\", \"finance\", \"social\", \"health\"]\n", + "anonymizer3 = Anonymize(100, QI2, categorical_features=categorical_features)\n", + "anon3 = anonymizer3.anonymize(x_train, x_train_predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "39" + ] + }, + "execution_count": 112, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# number of distinct rows in anonymized data\n", + "len(anon3.drop_duplicates())" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Anonymized model accuracy: 0.7723765432098766\n", + "BB attack accuracy: 0.5792012348060969\n", + "WB attack accuracy: 0.6680493922438742\n" + ] + } + ], + "source": [ + "anon3_str = anon3.astype(str)\n", + "anon3_encoded = 
OneHotEncoder(sparse=False, drop='if_binary').fit_transform(anon3_str)\n", + "\n", + "anon3_model = DecisionTreeClassifier()\n", + "anon3_model.fit(anon3_encoded, y_train)\n", + "\n", + "anon3_art_classifier = ScikitlearnDecisionTreeClassifier(anon3_model)\n", + "\n", + "print('Anonymized model accuracy: ', anon3_model.score(test_encoded, y_test))\n", + "\n", + "anon3_bb_attack = AttributeInferenceBlackBox(anon3_art_classifier, attack_feature=attack_feature)\n", + "\n", + "# get original model's predictions\n", + "anon3_x_train_predictions = np.array([np.argmax(arr) for arr in anon3_art_classifier.predict(train_encoded)]).reshape(-1,1)\n", + "\n", + "# train attack model\n", + "anon3_bb_attack.fit(train_encoded[:attack_train_size])\n", + "\n", + "# get inferred values\n", + "inferred_train_anon3_bb = anon3_bb_attack.infer(x_train_for_attack[attack_train_size:], anon3_x_train_predictions[attack_train_size:], values=values)\n", + "# check accuracy\n", + "train_acc = np.sum(inferred_train_anon3_bb == np.around(x_train_feature[attack_train_size:], decimals=8).reshape(1,-1)) / len(inferred_train_anon2_bb)\n", + "print('BB attack accuracy: ', train_acc)\n", + "\n", + "anon3_wb2_attack = AttributeInferenceWhiteBoxDecisionTree(anon3_art_classifier, attack_feature=attack_feature)\n", + "\n", + "# get inferred values\n", + "inferred_train_anon3_wb2 = anon3_wb2_attack.infer(x_train_for_attack, anon3_x_train_predictions, values=values, priors=priors)\n", + "\n", + "# check accuracy\n", + "train_acc = np.sum(inferred_train_anon3_wb2 == np.around(x_train_feature, decimals=8).reshape(1,-1)) / len(inferred_train_anon_wb2)\n", + "print('WB attack accuracy: ', train_acc)" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0.35793357933579334, 0.17037470725995316)\n", + "(0.3393939393939394, 0.13114754098360656)\n", + "(0.6457357075913777, 0.2002324905550712)\n", + "(1, 
0.0)\n" + ] + } + ], + "source": [ + "# black-box regular\n", + "print(calc_precision_recall(inferred_train_bb, x_train_feature))\n", + "# black-box anonymized\n", + "print(calc_precision_recall(inferred_train_anon3_bb, x_train_feature))\n", + "\n", + "# white-box regular\n", + "print(calc_precision_recall(inferred_train_wb2, x_train_feature))\n", + "# white-box anonymized\n", + "print(calc_precision_recall(inferred_train_anon3_wb2, x_train_feature))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Accuracy of both attacks has decreased. Precision and recall remain roughly the same in the black-box case. \n", + "\n", + "*In the anonymized version of the white-box attack, no records were predicted with the positive value for the attacked feature." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/membership_inference_anonymization_adult.ipynb b/notebooks/membership_inference_anonymization_adult.ipynb new file mode 100644 index 0000000..961a739 --- /dev/null +++ b/notebooks/membership_inference_anonymization_adult.ipynb @@ -0,0 +1,422 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using ML anonymization to defend against membership inference attacks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial we will show how to anonymize models using the ML anonymization module. \n", + "\n", + "We will demonstrate running inference attacks both on a vanilla model, and then on an anonymized version of the model. 
We will run a black-box membership inference attack using ART's inference module (https://github.com/Trusted-AI/adversarial-robustness-toolbox/tree/main/art/attacks/inference). \n", + "\n", + "This will be demonstrated using the Adult dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/datasets/adult). \n", + "\n", + "For simplicity, we used only the numerical features in the dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load data" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 39. 13. 2174. 0. 40.]\n", + " [ 50. 13. 0. 0. 13.]\n", + " [ 38. 9. 0. 0. 40.]\n", + " ...\n", + " [ 27. 13. 0. 0. 40.]\n", + " [ 26. 11. 0. 0. 48.]\n", + " [ 27. 9. 0. 0. 40.]]\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "# Use only numeric features (age, education-num, capital-gain, capital-loss, hours-per-week)\n", + "x_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n", + " usecols=(0, 4, 10, 11, 12), delimiter=\", \")\n", + "\n", + "y_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n", + " usecols=14, dtype=str, delimiter=\", \")\n", + "\n", + "\n", + "x_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n", + " usecols=(0, 4, 10, 11, 12), delimiter=\", \", skiprows=1)\n", + "\n", + "y_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n", + " usecols=14, dtype=str, delimiter=\", \", skiprows=1)\n", + "\n", + "# Trim trailing period \".\" from label\n", + "y_test = np.array([a[:-1] for a in y_test])\n", + "\n", + "y_train[y_train == '<=50K'] = 0\n", + "y_train[y_train == '>50K'] = 1\n", + "y_train = y_train.astype(np.int)\n", + "\n", + "y_test[y_test == '<=50K'] = 0\n", + "y_test[y_test == 
'>50K'] = 1\n", + "y_test = y_test.astype(np.int)\n", + "\n", + "# get balanced dataset\n", + "x_train = x_train[:x_test.shape[0]]\n", + "y_train = y_train[:y_test.shape[0]]\n", + "\n", + "print(x_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train decision tree model" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Base model accuracy: 0.8075056814691972\n" + ] + } + ], + "source": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "from art.estimators.classification.scikitlearn import ScikitlearnDecisionTreeClassifier\n", + "\n", + "model = DecisionTreeClassifier()\n", + "model.fit(x_train, y_train)\n", + "\n", + "art_classifier = ScikitlearnDecisionTreeClassifier(model)\n", + "\n", + "print('Base model accuracy: ', model.score(x_test, y_test))\n", + "\n", + "x_train_predictions = np.array([np.argmax(arr) for arr in art_classifier.predict(x_train)]).reshape(-1,1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Attack\n", + "The black-box attack basically trains an additional classifier (called the attack model) to predict the membership status of a sample.\n", + "#### Train attack model" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [], + "source": [ + "from art.attacks.inference.membership_inference import MembershipInferenceBlackBox\n", + "\n", + "# attack_model_type can be nn (neural network), rf (randon forest) or gb (gradient boosting)\n", + "bb_attack = MembershipInferenceBlackBox(art_classifier, attack_model_type='rf')\n", + "\n", + "# use half of each dataset for training the attack\n", + "attack_train_ratio = 0.5\n", + "attack_train_size = int(len(x_train) * attack_train_ratio)\n", + "attack_test_size = int(len(x_test) * attack_train_ratio)\n", + "\n", + "# train attack model\n", + 
"bb_attack.fit(x_train[:attack_train_size], y_train[:attack_train_size],\n", + " x_test[:attack_test_size], y_test[:attack_test_size])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Infer sensitive feature and check accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.5440363591696352\n" + ] + } + ], + "source": [ + "# get inferred values for remaining half\n", + "inferred_train_bb = bb_attack.infer(x_train[attack_train_size:], y_train[attack_train_size:])\n", + "inferred_test_bb = bb_attack.infer(x_test[attack_test_size:], y_test[attack_test_size:])\n", + "# check accuracy\n", + "train_acc = np.sum(inferred_train_bb) / len(inferred_train_bb)\n", + "test_acc = 1 - (np.sum(inferred_test_bb) / len(inferred_test_bb))\n", + "acc = (train_acc * len(inferred_train_bb) + test_acc * len(inferred_test_bb)) / (len(inferred_train_bb) + len(inferred_test_bb))\n", + "print(acc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This means that for 54% of the data, membership is inferred correctly using this attack." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Anonymized data\n", + "## k=100\n", + "\n", + "Now we will apply the same attacks on an anonymized version of the same dataset (k=100). The data is anonymized on the quasi-identifiers: age, education-num, capital-gain, hours-per-week.\n", + "\n", + "k=100 means that each record in the anonymized dataset is identical to 99 others on the quasi-identifier values (i.e., when looking only at those features, the records are indistinguishable)." + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[38. 13. 0. 0. 40.]\n", + " [57. 13. 0. 0. 30.]\n", + " [37. 9. 0. 0. 40.]\n", + " ...\n", + " [26. 13. 0. 0. 
40.]\n", + " [29. 10. 0. 0. 50.]\n", + " [25. 9. 0. 0. 40.]]\n" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "sys.path.insert(0, os.path.abspath('..'))\n", + "from apt.anonymization import Anonymize\n", + "\n", + "# QI = (age, education-num, capital-gain, hours-per-week)\n", + "QI = [0, 1, 2, 4]\n", + "anonymizer = Anonymize(100, QI)\n", + "anon = anonymizer.anonymize(x_train, x_train_predictions)\n", + "print(anon)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "6739" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# number of distinct rows in original data\n", + "len(np.unique(x_train, axis=0))" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "658" + ] + }, + "execution_count": 129, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# number of distinct rows in anonymized data\n", + "len(np.unique(anon, axis=0))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train decision tree model" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Anonymized model accuracy: 0.8304158221239482\n" + ] + } + ], + "source": [ + "anon_model = DecisionTreeClassifier()\n", + "anon_model.fit(anon, y_train)\n", + "\n", + "anon_art_classifier = ScikitlearnDecisionTreeClassifier(anon_model)\n", + "\n", + "print('Anonymized model accuracy: ', anon_model.score(x_test, y_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Attack\n", + "### Black-box attack" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.5034393809114359\n" + ] + 
} + ], + "source": [ + "anon_bb_attack = MembershipInferenceBlackBox(anon_art_classifier, attack_model_type='rf')\n", + "\n", + "# train attack model\n", + "anon_bb_attack.fit(x_train[:attack_train_size], y_train[:attack_train_size],\n", + " x_test[:attack_test_size], y_test[:attack_test_size])\n", + "\n", + "# get inferred values\n", + "anon_inferred_train_bb = anon_bb_attack.infer(x_train[attack_train_size:], y_train[attack_train_size:])\n", + "anon_inferred_test_bb = anon_bb_attack.infer(x_test[attack_test_size:], y_test[attack_test_size:])\n", + "# check accuracy\n", + "anon_train_acc = np.sum(anon_inferred_train_bb) / len(anon_inferred_train_bb)\n", + "anon_test_acc = 1 - (np.sum(anon_inferred_test_bb) / len(anon_inferred_test_bb))\n", + "anon_acc = (anon_train_acc * len(anon_inferred_train_bb) + anon_test_acc * len(anon_inferred_test_bb)) / (len(anon_inferred_train_bb) + len(anon_inferred_test_bb))\n", + "print(anon_acc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Attack accuracy is reduced to 50% (equivalent to random guessing)" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0.5298924372550654, 0.7806166318634075)\n", + "(0.5030507735890172, 0.5671293452892765)\n" + ] + } + ], + "source": [ + "def calc_precision_recall(predicted, actual, positive_value=1):\n", + " score = 0 # both predicted and actual are positive\n", + " num_positive_predicted = 0 # predicted positive\n", + " num_positive_actual = 0 # actual positive\n", + " for i in range(len(predicted)):\n", + " if predicted[i] == positive_value:\n", + " num_positive_predicted += 1\n", + " if actual[i] == positive_value:\n", + " num_positive_actual += 1\n", + " if predicted[i] == actual[i]:\n", + " if predicted[i] == positive_value:\n", + " score += 1\n", + " \n", + " if num_positive_predicted == 0:\n", + " precision = 1\n", + " else:\n", + " precision = score 
/ num_positive_predicted # the fraction of predicted “Yes” responses that are correct\n", + " if num_positive_actual == 0:\n", + " recall = 1\n", + " else:\n", + " recall = score / num_positive_actual # the fraction of “Yes” responses that are predicted correctly\n", + "\n", + " return precision, recall\n", + "\n", + "# regular\n", + "print(calc_precision_recall(np.concatenate((inferred_train_bb, inferred_test_bb)), \n", + " np.concatenate((np.ones(len(inferred_train_bb)), np.zeros(len(inferred_test_bb))))))\n", + "# anon\n", + "print(calc_precision_recall(np.concatenate((anon_inferred_train_bb, anon_inferred_test_bb)), \n", + " np.concatenate((np.ones(len(anon_inferred_train_bb)), np.zeros(len(anon_inferred_test_bb))))))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Precision and recall are also reduced." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ba69642 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +numpy==1.19.0 +pandas==1.1.0 +scipy==1.4.1 +scikit-learn==0.22.2 + +# testing +pytest==5.4.2 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_anonymizer.py b/tests/test_anonymizer.py new file mode 100644 index 0000000..8c360fb --- /dev/null +++ b/tests/test_anonymizer.py @@ -0,0 +1,79 @@ +import pytest +import numpy as np +from sklearn.tree import DecisionTreeClassifier +from sklearn.preprocessing import OneHotEncoder + +from apt.anonymization import Anonymize +from apt.utils import get_iris_dataset, 
get_adult_dataset, get_nursery_dataset + + +def test_anonymize_ndarray_iris(): + (x_train, y_train), _ = get_iris_dataset() + model = DecisionTreeClassifier() + model.fit(x_train, y_train) + pred = model.predict(x_train) + + k = 10 + QI = [0, 2] + anonymizer = Anonymize(k, QI) + anon = anonymizer.anonymize(x_train, pred) + + assert(len(np.unique(anon, axis=0)) < len(np.unique(x_train, axis=0))) + _, counts_elements = np.unique(anon[:, QI], return_counts=True) + assert (np.min(counts_elements) >= k) + assert ((np.delete(anon, QI, axis=1) == np.delete(x_train, QI, axis=1)).all()) + + +def test_anonymize_pandas_adult(): + (x_train, y_train), _ = get_adult_dataset() + encoded = OneHotEncoder().fit_transform(x_train) + model = DecisionTreeClassifier() + model.fit(encoded, y_train) + pred = model.predict(encoded) + + k = 100 + QI = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', + 'native-country'] + categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', + 'native-country'] + anonymizer = Anonymize(k, QI, categorical_features=categorical_features) + anon = anonymizer.anonymize(x_train, pred) + + assert(anon.drop_duplicates().shape[0] < x_train.drop_duplicates().shape[0]) + assert (anon.loc[:, QI].value_counts().min() >= k) + assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1))) + + +def test_anonymize_pandas_nursery(): + (x_train, y_train), _ = get_nursery_dataset() + x_train = x_train.astype(str) + encoded = OneHotEncoder().fit_transform(x_train) + model = DecisionTreeClassifier() + model.fit(encoded, y_train) + pred = model.predict(encoded) + + k = 100 + QI = ["finance", "social", "health"] + categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health", 'children'] + anonymizer = Anonymize(k, QI, categorical_features=categorical_features) + anon = anonymizer.anonymize(x_train, pred) + + assert(anon.drop_duplicates().shape[0] < 
x_train.drop_duplicates().shape[0]) + assert (anon.loc[:, QI].value_counts().min() >= k) + assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1))) + + +def test_errors(): + with pytest.raises(ValueError): + Anonymize(1, [0, 2]) + with pytest.raises(ValueError): + Anonymize(2, []) + with pytest.raises(ValueError): + Anonymize(2, None) + anonymizer = Anonymize(10, [0, 2]) + (x_train, y_train), (x_test, y_test) = get_iris_dataset() + with pytest.raises(ValueError): + anonymizer.anonymize(x_train, y_test) + (x_train, y_train), _ = get_adult_dataset() + with pytest.raises(ValueError): + anonymizer.anonymize(x_train, y_train)