mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-06-08 15:05:13 +02:00
Initial commit
This commit is contained in:
parent
d2de0726f4
commit
5665c2e79d
22 changed files with 2369 additions and 0 deletions
26
README.md
26
README.md
|
|
@ -1,2 +1,28 @@
|
||||||
# ai-privacy-toolkit
|
# ai-privacy-toolkit
|
||||||
|
<p align="center">
|
||||||
|
<img src="docs/images/logo with text.jpg?raw=true" width="467" title="ai-privacy-toolkit logo">
|
||||||
|
</p>
|
||||||
|
<br />
|
||||||
|
|
||||||
A toolkit for tools and techniques related to the privacy and compliance of AI models.
|
A toolkit for tools and techniques related to the privacy and compliance of AI models.
|
||||||
|
|
||||||
|
The first release of this toolkit contains a single module called [**anonymization**](apt/anonymization/README.md).
|
||||||
|
This module contains methods for anonymizing ML model training data, so that when
|
||||||
|
a model is retrained on the anonymized data, the model itself will also be considered
|
||||||
|
anonymous. This may help exempt the model from different obligations and restrictions
|
||||||
|
set out in data protection regulations such as GDPR, CCPA, etc.
|
||||||
|
|
||||||
|
Official ai-privacy-toolkit documentation: <add link to readthedocs>
|
||||||
|
|
||||||
|
**Related toolkits:**
|
||||||
|
|
||||||
|
[ai-minimization-toolkit](https://github.com/IBM/ai-minimization-toolkit): A toolkit for
|
||||||
|
reducing the amount of personal data needed to perform predictions with a machine learning model
|
||||||
|
|
||||||
|
[differential-privacy-library](https://github.com/IBM/differential-privacy-library): A
|
||||||
|
general-purpose library for experimenting with, investigating and developing applications in,
|
||||||
|
differential privacy.
|
||||||
|
|
||||||
|
[adversarial-robustness-toolbox](https://github.com/Trusted-AI/adversarial-robustness-toolbox):
|
||||||
|
A Python library for Machine Learning Security.
|
||||||
|
|
||||||
|
|
|
||||||
4
apt/__init__.py
Normal file
4
apt/__init__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
from apt import anonymization
|
||||||
|
from apt import utils
|
||||||
|
|
||||||
|
__version__ = "0.0.1"
|
||||||
22
apt/anonymization/README.md
Normal file
22
apt/anonymization/README.md
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
# anonymization module
|
||||||
|
This module contains methods for anonymizing ML model training data, so that when
|
||||||
|
a model is retrained on the anonymized data, the model itself will also be considered
|
||||||
|
anonymous. This may help exempt the model from different obligations and restrictions
|
||||||
|
set out in data protection regulations such as GDPR, CCPA, etc.
|
||||||
|
|
||||||
|
The module contains methods that enable anonymizing training datasets in a manner that
|
||||||
|
is tailored to and guided by an existing, trained ML model. It uses the existing model's
|
||||||
|
predictions on the training data to train a second, anonymizer model, that eventually determines
|
||||||
|
the generalizations that will be applied to the training data. For more information about the
|
||||||
|
method see: https://arxiv.org/abs/2007.13086
|
||||||
|
|
||||||
|
Once the anonymized training data is returned, it can be used to retrain the model.
|
||||||
|
|
||||||
|
The following figure depicts the overall process:
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
<img src="../../docs/images/AI_Privacy_project2.jpg?raw=true" width="667" title="anonymization process">
|
||||||
|
</p>
|
||||||
|
<br />
|
||||||
|
|
||||||
|
|
||||||
17
apt/anonymization/__init__.py
Normal file
17
apt/anonymization/__init__.py
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
"""
|
||||||
|
Module providing ML anonymization.
|
||||||
|
|
||||||
|
This module contains methods for anonymizing ML model training data, so that when
|
||||||
|
a model is retrained on the anonymized data, the model itself will also be considered
|
||||||
|
anonymous. This may help exempt the model from different obligations and restrictions
|
||||||
|
set out in data protection regulations such as GDPR, CCPA, etc.
|
||||||
|
|
||||||
|
The module contains methods that enable anonymizing training datasets in a manner that
|
||||||
|
is tailored to and guided by an existing, trained ML model. It uses the existing model's
|
||||||
|
predictions on the training data to train a second, anonymizer model, that eventually determines
|
||||||
|
the generalizations that will be applied to the training data. For more information about the
|
||||||
|
method see: https://arxiv.org/abs/2007.13086
|
||||||
|
|
||||||
|
Once the anonymized training data is returned, it can be used to retrain the model.
|
||||||
|
"""
|
||||||
|
from apt.anonymization.anonymizer import Anonymize
|
||||||
165
apt/anonymization/anonymizer.py
Normal file
165
apt/anonymization/anonymizer.py
Normal file
|
|
@ -0,0 +1,165 @@
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from scipy.spatial import distance
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
from sklearn.tree import DecisionTreeClassifier
|
||||||
|
|
||||||
|
from typing import Union, Optional
|
||||||
|
|
||||||
|
|
||||||
|
class Anonymize:
    """
    Class for performing tailored, model-guided anonymization of training datasets for ML models.

    Based on the implementation described in: https://arxiv.org/abs/2007.13086

    Attributes set while anonymizing:

    * ``anonymizer`` - the decision tree fitted on the quasi-identifiers and the supplied predictions.
    * ``nodes`` - the ids of the leaf nodes of that tree.
    * ``categorical_values`` / ``one_hot_to_features`` - bookkeeping of the one-hot encoding (pandas input only).
    """
    def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list]=None):
        """
        :param k: The privacy parameter that determines the number of records that will be indistinguishable from each
                  other (when looking at the quasi identifiers). Should be at least 2.
        :param quasi_identifiers: The indexes of the features that need to be anonymized (these should be the features
                                  that may directly, indirectly or in combination with additional data, identify an
                                  individual).
        :param categorical_features: The list of categorical features (should only be supplied when passing data as a
                                     pandas dataframe).
        :raises ValueError: If ``k`` is smaller than 2 or no quasi-identifiers are supplied.
        """
        if k < 2:
            raise ValueError("k should be a positive integer with a value of 2 or higher")
        # Explicit None/length test: `not array` is ambiguous (and raises) for multi-element numpy arrays.
        if quasi_identifiers is None or len(quasi_identifiers) < 1:
            raise ValueError("The list of quasi-identifiers cannot be empty")

        self.k = k
        self.quasi_identifiers = quasi_identifiers
        self.categorical_features = categorical_features

    def anonymize(self, x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame]) \
            -> Union[np.ndarray, pd.DataFrame]:
        """
        Method for performing model-guided anonymization.

        :param x: The training data for the model. If provided as a pandas dataframe, may contain both numeric and
                  categorical data.
        :param y: The predictions of the original model on the training data.
        :return: An array containing the anonymized training dataset. The input ``x`` is copied and left untouched.
        :raises ValueError: If ``x`` is not a numpy array and no categorical features were defined, or if ``x`` and
                            ``y`` have a different number of rows.
        """
        if isinstance(x, np.ndarray):
            return self._anonymize_ndarray(x.copy(), y)
        # anything else is treated as a pandas dataframe
        if not self.categorical_features:
            raise ValueError('When supplying a pandas dataframe, categorical_features must be defined')
        return self._anonymize_pandas(x.copy(), y)

    def _anonymize_ndarray(self, x, y):
        """Fit the anonymizer tree on the quasi-identifier columns of a numpy array and generalize ``x`` in place."""
        if x.shape[0] != y.shape[0]:
            raise ValueError("x and y should have same number of rows")
        x_anonymizer_train = x[:, self.quasi_identifiers]
        # min_samples_leaf=k guarantees that every leaf (cell) groups at least k training records
        self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
        self.anonymizer.fit(x_anonymizer_train, y)
        cells_by_id = self._calculate_cells(x, x_anonymizer_train)
        return self._anonymize_data_numpy(x, x_anonymizer_train, cells_by_id)

    def _anonymize_pandas(self, x, y):
        """Fit the anonymizer tree on the one-hot encoded quasi-identifiers of a dataframe and generalize ``x``."""
        if x.shape[0] != y.shape[0]:
            raise ValueError("x and y should have same number of rows")
        x_anonymizer_train = x.loc[:, self.quasi_identifiers]
        # need to one-hot encode before training the decision tree
        x_prepared = self._modify_categorical_features(x_anonymizer_train)
        self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
        self.anonymizer.fit(x_prepared, y)
        cells_by_id = self._calculate_cells(x, x_prepared)
        return self._anonymize_data_pandas(x, x_prepared, cells_by_id)

    def _calculate_cells(self, x, x_anonymizer_train):
        """
        Build one cell per leaf of the fitted tree and attach a representative record to each.

        :param x: The original data.
        :param x_anonymizer_train: Only the quasi-identifiers (one-hot encoded for pandas input).
        :return: Dictionary mapping leaf node id to its cell (``label``, ``hist``, ``id``, ``representative``).
        """
        cells_by_id = {}
        leaves = []
        tree = self.anonymizer.tree_
        for node, feature in enumerate(tree.feature):
            if feature != -2:  # -2 marks a leaf node; everything else is an internal split
                continue
            leaves.append(node)
            label_hist = tree.value[node][0]
            node_id = int(node)
            cells_by_id[node_id] = {
                'label': int(self.anonymizer.classes_[np.argmax(label_hist)]),
                'hist': [int(i) for i in label_hist],
                'id': node_id,
            }
        self.nodes = leaves
        self._find_representatives(x, x_anonymizer_train, cells_by_id.values())
        return cells_by_id

    def _find_representatives(self, x, x_anonymizer_train, cells):
        """
        Compute, for every cell, the representative value of each quasi-identifier and store it under
        ``cell['representative']``.

        Categorical features use the most common value in the cell; all other features use the actual value in the
        cell that is closest to the cell's median (first one wins on ties).

        :param x: The original data.
        :param x_anonymizer_train: Only the quasi-identifiers (one-hot encoded for pandas input).
        :param cells: Iterable of cell dictionaries to update in place.
        """
        is_ndarray = isinstance(x, np.ndarray)

        # group the row positions by leaf once instead of rescanning all samples for every cell
        indexes_by_node = {}
        for index, node_id in enumerate(self._find_sample_nodes(x_anonymizer_train)):
            indexes_by_node.setdefault(node_id, []).append(index)

        for cell in cells:
            cell['representative'] = {}
            # get all rows in cell
            indexes = indexes_by_node.get(cell['id'], [])
            # TODO: should we filter only those with majority label? (using hist)
            rows = x[indexes] if is_ndarray else x.iloc[indexes]
            for feature in self.quasi_identifiers:
                values = rows[:, feature] if is_ndarray else rows.loc[:, feature]
                if self.categorical_features and feature in self.categorical_features:
                    # find most common value
                    cell['representative'][feature] = Counter(values).most_common(1)[0][0]
                else:
                    # find the existing value closest to the median (per feature); for scalars the
                    # euclidean distance is simply the absolute difference
                    numeric_values = np.asarray(values)
                    median = np.median(numeric_values)
                    closest = np.argmin(np.abs(numeric_values - median))
                    cell['representative'][feature] = numeric_values[closest]

    def _find_sample_nodes(self, samples):
        """Return, for each sample, the id of the leaf node of the anonymizer tree that it falls into."""
        # apply() yields the leaf index per sample directly, avoiding a dense samples-by-nodes path matrix
        return self.anonymizer.apply(samples).tolist()

    def _find_sample_cells(self, samples, cells_by_id):
        """Return, for each sample, the cell (leaf description) that it belongs to."""
        return [cells_by_id[node_id] for node_id in self._find_sample_nodes(samples)]

    def _anonymize_data_numpy(self, x, x_anonymizer_train, cells_by_id):
        """Overwrite the quasi-identifiers of every row of the numpy array ``x`` with its cell's representative."""
        cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
        # iterating a 2-D array yields row views, so the assignments below modify x itself
        for row, cell in zip(x, cells):
            for feature, value in cell['representative'].items():
                row[feature] = value
        return x

    def _anonymize_data_pandas(self, x, x_anonymizer_train, cells_by_id):
        """Overwrite the quasi-identifiers of every row of the dataframe ``x`` with its cell's representative."""
        cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
        for i, cell in zip(x.index, cells):
            for feature, value in cell['representative'].items():
                x.at[i, feature] = value
        return x

    def _modify_categorical_features(self, x):  # only for pandas
        """
        One-hot encode the categorical quasi-identifiers of ``x``.

        Records the observed values per feature in ``self.categorical_values`` and the mapping from each one-hot
        column back to its source feature in ``self.one_hot_to_features``.

        :param x: Dataframe containing only the quasi-identifier columns.
        :return: Dataframe in which each categorical quasi-identifier is replaced by its one-hot columns.
        """
        self.categorical_values = {}
        self.one_hot_to_features = {}
        features_to_remove = []
        for feature in self.categorical_features:
            if feature not in self.quasi_identifiers:
                continue
            values = list(x.loc[:, feature].unique())
            self.categorical_values[feature] = values
            x[feature] = pd.Categorical(x.loc[:, feature], categories=values, ordered=False)
            one_hot_vector = pd.get_dummies(x[feature], prefix=feature)
            for one_hot_vector_feature in one_hot_vector.columns:
                self.one_hot_to_features[one_hot_vector_feature] = feature
            x = pd.concat([x, one_hot_vector], axis=1)
            features_to_remove.append(feature)
        return x.drop(features_to_remove, axis=1)
|
||||||
219
apt/utils.py
Normal file
219
apt/utils.py
Normal file
|
|
@ -0,0 +1,219 @@
|
||||||
|
from sklearn import datasets, model_selection
|
||||||
|
import sklearn.preprocessing
|
||||||
|
import pandas as pd
|
||||||
|
import ssl
|
||||||
|
from os import path
|
||||||
|
from six.moves.urllib.request import urlretrieve
|
||||||
|
|
||||||
|
|
||||||
|
def _load_iris(test_set_size: float=0.3):
    """
    Load the scikit-learn Iris dataset and split it into stratified training and test sets.

    :param test_set_size: Value forwarded as ``test_size`` to ``train_test_split``.
    :return: Two tuples: ``(x_train, y_train), (x_test, y_test)``.
    """
    bunch = datasets.load_iris()
    samples, targets = bunch.data, bunch.target

    # Split training and test sets (fixed random_state keeps the split reproducible)
    train_x, test_x, train_y, test_y = model_selection.train_test_split(
        samples, targets, test_size=test_set_size, shuffle=True, stratify=targets, random_state=18)

    return (train_x, train_y), (test_x, test_y)
|
||||||
|
|
||||||
|
|
||||||
|
def get_iris_dataset(test_set: float = 0.3):
    """
    Loads the Iris dataset from scikit-learn.

    :param test_set: Proportion of the data to use as validation split (value between 0 and 1).
    :return: The training and test splits, in the form ``(x_train, y_train), (x_test, y_test)``.
    """
    return _load_iris(test_set)
|
||||||
|
|
||||||
|
|
||||||
|
def get_adult_dataset():
    """
    Loads the UCI Adult dataset from `../datasets/adult` (relative to the current working directory), downloading
    the train and test files first if they are not present there.

    :return: Features as pandas dataframes and labels as pandas series, in the form
             `(x_train, y_train), (x_test, y_test)`.
    """
    import os  # local import: only needed to create the download directory

    features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
                'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
                'label']
    train_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
    test_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
    train_file = '../datasets/adult/train'
    test_file = '../datasets/adult/test'

    # NOTE(review): this replaces the process-wide default HTTPS context with an unverified one, i.e. it disables
    # certificate verification for every later urllib HTTPS request in the process. Kept for backward
    # compatibility; consider removing it.
    ssl._create_default_https_context = ssl._create_unverified_context
    for url, file_name in ((train_url, train_file), (test_url, test_file)):
        if not path.exists(file_name):
            # urlretrieve cannot create missing parent directories itself
            os.makedirs(path.dirname(file_name), exist_ok=True)
            urlretrieve(url, file_name)

    train = pd.read_csv(train_file, sep=', ', names=features, engine='python')
    test = pd.read_csv(test_file, sep=', ', names=features, engine='python')
    # skip the first row of the test file (presumably a non-data header line - TODO confirm)
    test = test.iloc[1:]

    train = _modify_adult_dataset(train)
    test = _modify_adult_dataset(test)

    x_train = train.drop(['label'], axis=1)
    y_train = train.loc[:, 'label']
    x_test = test.drop(['label'], axis=1)
    y_test = test.loc[:, 'label']

    return (x_train, y_train), (x_test, y_test)
|
||||||
|
|
||||||
|
|
||||||
|
def _modify_adult_dataset(data):
|
||||||
|
def modify_label(value):
|
||||||
|
if value == '<=50K.' or value == '<=50K':
|
||||||
|
return 0
|
||||||
|
elif value == '>50K.' or value == '>50K':
|
||||||
|
return 1
|
||||||
|
else:
|
||||||
|
raise Exception('Bad label value')
|
||||||
|
|
||||||
|
def modify_native_country(value):
|
||||||
|
Euro_1 = ['Italy', 'Holand-Netherlands', 'Germany', 'France']
|
||||||
|
Euro_2 = ['Yugoslavia', 'South', 'Portugal', 'Poland', 'Hungary', 'Greece']
|
||||||
|
SE_Asia = ['Vietnam', 'Thailand', 'Philippines', 'Laos', 'Cambodia']
|
||||||
|
UnitedStates = ['United-States']
|
||||||
|
LatinAmerica = ['Trinadad&Tobago', 'Puerto-Rico', 'Outlying-US(Guam-USVI-etc)', 'Nicaragua', 'Mexico',
|
||||||
|
'Jamaica', 'Honduras', 'Haiti', 'Guatemala', 'Dominican-Republic']
|
||||||
|
China = ['Taiwan', 'Hong', 'China']
|
||||||
|
BritishCommonwealth = ['Scotland', 'Ireland', 'India', 'England', 'Canada']
|
||||||
|
SouthAmerica = ['Peru', 'El-Salvador', 'Ecuador', 'Columbia']
|
||||||
|
Other = ['Japan', 'Iran', 'Cuba']
|
||||||
|
|
||||||
|
if value in Euro_1:
|
||||||
|
return 'Euro_1'
|
||||||
|
elif value in Euro_2:
|
||||||
|
return 'Euro_2'
|
||||||
|
elif value in SE_Asia:
|
||||||
|
return 'SE_Asia'
|
||||||
|
elif value in UnitedStates:
|
||||||
|
return 'UnitedStates'
|
||||||
|
elif value in LatinAmerica:
|
||||||
|
return 'LatinAmerica'
|
||||||
|
elif value in China:
|
||||||
|
return 'China'
|
||||||
|
elif value in BritishCommonwealth:
|
||||||
|
return 'BritishCommonwealth'
|
||||||
|
elif value in SouthAmerica:
|
||||||
|
return 'SouthAmerica'
|
||||||
|
elif value in Other:
|
||||||
|
return 'Other'
|
||||||
|
elif value == '?':
|
||||||
|
return 'Unknown'
|
||||||
|
else:
|
||||||
|
raise Exception('Bad native country value')
|
||||||
|
|
||||||
|
data['label'] = data['label'].apply(modify_label)
|
||||||
|
data['native-country'] = data['native-country'].apply(modify_native_country)
|
||||||
|
|
||||||
|
for col in ('age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'):
|
||||||
|
try:
|
||||||
|
data[col] = data[col].fillna(0)
|
||||||
|
except KeyError:
|
||||||
|
print('missing column ' + col)
|
||||||
|
|
||||||
|
for col in ('workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'):
|
||||||
|
try:
|
||||||
|
data[col] = data[col].fillna('NA')
|
||||||
|
except KeyError:
|
||||||
|
print('missing column ' + col)
|
||||||
|
|
||||||
|
return data.drop(['fnlwgt', 'education'], axis=1)
|
||||||
|
|
||||||
|
|
||||||
|
def get_nursery_dataset(raw: bool = True, test_set: float = 0.2, transform_social: bool = False):
    """
    Loads the UCI Nursery dataset from `../datasets/nursery` (relative to the current working directory),
    downloading it first if it is not present there.

    :param raw: `True` if no preprocessing should be applied to the data. Otherwise, categorical data is one-hot
                encoded and data is scaled using sklearn's StandardScaler.
    :param test_set: Proportion of the data to use as validation split. The value should be between 0 and 1.
    :param transform_social: If `True`, transforms the social feature to be binary for the purpose of attribute
                             inference. This is done by assigning the original value 'problematic' the new value 1, and
                             the other original values are assigned the new value 0.
    :return: Features as pandas dataframes and labels as pandas series, in the form
             `(x_train, y_train), (x_test, y_test)`.
    """
    import os  # local import: only needed to create the download directory

    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data'
    data_file = '../datasets/nursery/data'

    # NOTE(review): this replaces the process-wide default HTTPS context with an unverified one, i.e. it disables
    # certificate verification for every later urllib HTTPS request in the process. Kept for backward
    # compatibility; consider removing it.
    ssl._create_default_https_context = ssl._create_unverified_context
    if not path.exists(data_file):
        # urlretrieve cannot create missing parent directories itself
        os.makedirs(path.dirname(data_file), exist_ok=True)
        urlretrieve(url, data_file)

    # load data
    features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health", "label"]
    categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health"]
    data = pd.read_csv(data_file, sep=",", names=features, engine="python")
    # remove rows with missing label or too sparse label
    data = data.dropna(subset=["label"])
    data = data.loc[data["label"] != "recommend"].copy()

    # fill missing values
    data["children"] = data["children"].fillna(0)
    for col in categorical_features:
        data[col] = data[col].fillna("other")

    # make categorical label (4 classes, "recommend" was removed above)
    label_to_class = {"not_recom": 0, "very_recom": 1, "priority": 2, "spec_prior": 3}

    def modify_label(value):
        if value in label_to_class:
            return label_to_class[value]
        raise Exception("Bad label value: %s" % value)

    data["label"] = data["label"].apply(modify_label)
    data["children"] = data["children"].apply(lambda x: 4 if x == "more" else x)

    if transform_social:
        # binary social feature: 1 for 'problematic', 0 for everything else
        data["social"] = data["social"].apply(lambda value: 1 if value == "problematic" else 0)
        categorical_features.remove("social")

    if not raw:
        # one-hot-encode categorical features
        for feature in categorical_features:
            values = list(data.loc[:, feature].unique())
            data[feature] = pd.Categorical(data.loc[:, feature], categories=values, ordered=False)
            one_hot_vector = pd.get_dummies(data[feature], prefix=feature)
            data = pd.concat([data, one_hot_vector], axis=1)
        data = data.drop(categorical_features, axis=1)

        # normalize data
        label = data.loc[:, "label"]
        unscaled = data.drop(["label"], axis=1)
        scaler = sklearn.preprocessing.StandardScaler()
        scaler.fit(unscaled)
        # keep the original row index: rows were dropped above, so a fresh 0..n-1 index would pair labels
        # with the wrong feature rows (and lose rows) when concatenating on the index below
        scaled_features = pd.DataFrame(scaler.transform(unscaled), columns=unscaled.columns, index=unscaled.index)
        data = pd.concat([label, scaled_features], axis=1, join="inner")

    # Split training and test sets (single stratified shuffle split with a fixed seed)
    splitter = model_selection.StratifiedShuffleSplit(n_splits=1, test_size=test_set, random_state=18)
    train_positions, test_positions = next(splitter.split(data, data["label"]))
    train = data.iloc[train_positions]
    test = data.iloc[test_positions]

    x_train = train.drop(["label"], axis=1)
    y_train = train.loc[:, "label"]
    x_test = test.drop(["label"], axis=1)
    y_test = test.loc[:, "label"]

    return (x_train, y_train), (x_test, y_test)
|
||||||
20
docs/Makefile
Normal file
20
docs/Makefile
Normal file
|
|
@ -0,0 +1,20 @@
|
||||||
|
# Minimal makefile for Sphinx documentation
|
||||||
|
#
|
||||||
|
|
||||||
|
# You can set these variables from the command line, and also
|
||||||
|
# from the environment for the first two.
|
||||||
|
SPHINXOPTS ?=
|
||||||
|
SPHINXBUILD ?= sphinx-build
|
||||||
|
SOURCEDIR = source
|
||||||
|
BUILDDIR = build
|
||||||
|
|
||||||
|
# Put it first so that "make" without argument is like "make help".
|
||||||
|
help:
|
||||||
|
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
|
|
||||||
|
.PHONY: help Makefile
|
||||||
|
|
||||||
|
# Catch-all target: route all unknown targets to Sphinx using the new
|
||||||
|
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
||||||
|
%: Makefile
|
||||||
|
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
BIN
docs/images/AI_Privacy_project2.jpg
Normal file
BIN
docs/images/AI_Privacy_project2.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 95 KiB |
BIN
docs/images/logo with text.jpg
Normal file
BIN
docs/images/logo with text.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 500 KiB |
35
docs/make.bat
Normal file
35
docs/make.bat
Normal file
|
|
@ -0,0 +1,35 @@
|
||||||
|
@ECHO OFF
|
||||||
|
|
||||||
|
pushd %~dp0
|
||||||
|
|
||||||
|
REM Command file for Sphinx documentation
|
||||||
|
|
||||||
|
if "%SPHINXBUILD%" == "" (
|
||||||
|
set SPHINXBUILD=sphinx-build
|
||||||
|
)
|
||||||
|
set SOURCEDIR=source
|
||||||
|
set BUILDDIR=build
|
||||||
|
|
||||||
|
if "%1" == "" goto help
|
||||||
|
|
||||||
|
%SPHINXBUILD% >NUL 2>NUL
|
||||||
|
if errorlevel 9009 (
|
||||||
|
echo.
|
||||||
|
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
||||||
|
echo.installed, then set the SPHINXBUILD environment variable to point
|
||||||
|
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
||||||
|
echo.may add the Sphinx directory to PATH.
|
||||||
|
echo.
|
||||||
|
echo.If you don't have Sphinx installed, grab it from
|
||||||
|
echo.http://sphinx-doc.org/
|
||||||
|
exit /b 1
|
||||||
|
)
|
||||||
|
|
||||||
|
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||||
|
goto end
|
||||||
|
|
||||||
|
:help
|
||||||
|
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||||
|
|
||||||
|
:end
|
||||||
|
popd
|
||||||
22
docs/source/apt.anonymization.rst
Normal file
22
docs/source/apt.anonymization.rst
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
apt.anonymization package
|
||||||
|
=========================
|
||||||
|
|
||||||
|
Submodules
|
||||||
|
----------
|
||||||
|
|
||||||
|
apt.anonymization.anonymizer module
|
||||||
|
-----------------------------------
|
||||||
|
|
||||||
|
.. automodule:: apt.anonymization.anonymizer
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
|
||||||
|
Module contents
|
||||||
|
---------------
|
||||||
|
|
||||||
|
.. automodule:: apt.anonymization
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
18
docs/source/apt.rst
Normal file
18
docs/source/apt.rst
Normal file
|
|
@ -0,0 +1,18 @@
|
||||||
|
apt package
|
||||||
|
===========
|
||||||
|
|
||||||
|
Subpackages
|
||||||
|
-----------
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 4
|
||||||
|
|
||||||
|
apt.anonymization
|
||||||
|
|
||||||
|
Module contents
|
||||||
|
---------------
|
||||||
|
|
||||||
|
.. automodule:: apt
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
57
docs/source/conf.py
Normal file
57
docs/source/conf.py
Normal file
|
|
@ -0,0 +1,57 @@
|
||||||
|
# Configuration file for the Sphinx documentation builder.
|
||||||
|
#
|
||||||
|
# This file only contains a selection of the most common options. For a full
|
||||||
|
# list see the documentation:
|
||||||
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
||||||
|
|
||||||
|
# -- Path setup --------------------------------------------------------------
|
||||||
|
|
||||||
|
# If extensions (or modules to document with autodoc) are in another directory,
|
||||||
|
# add these directories to sys.path here. If the directory is relative to the
|
||||||
|
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||||
|
#
|
||||||
|
# import os
|
||||||
|
# import sys
|
||||||
|
# sys.path.insert(0, os.path.abspath('.'))
|
||||||
|
|
||||||
|
|
||||||
|
# -- Project information -----------------------------------------------------
|
||||||
|
|
||||||
|
project = 'ai-privacy-toolkit'
|
||||||
|
copyright = '2021, Abigail Goldsteen'
|
||||||
|
author = 'Abigail Goldsteen'
|
||||||
|
|
||||||
|
# The full version, including alpha/beta/rc tags
|
||||||
|
release = '0.0.1'
|
||||||
|
|
||||||
|
|
||||||
|
# -- General configuration ---------------------------------------------------
|
||||||
|
|
||||||
|
# Add any Sphinx extension module names here, as strings. They can be
|
||||||
|
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||||
|
# ones.
|
||||||
|
extensions = [
|
||||||
|
'sphinx.ext.autodoc',
|
||||||
|
'sphinx.ext.napoleon'
|
||||||
|
]
|
||||||
|
|
||||||
|
# Add any paths that contain templates here, relative to this directory.
|
||||||
|
templates_path = ['_templates']
|
||||||
|
|
||||||
|
# List of patterns, relative to source directory, that match files and
|
||||||
|
# directories to ignore when looking for source files.
|
||||||
|
# This pattern also affects html_static_path and html_extra_path.
|
||||||
|
exclude_patterns = []
|
||||||
|
|
||||||
|
|
||||||
|
# -- Options for HTML output -------------------------------------------------
|
||||||
|
|
||||||
|
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||||
|
# a list of builtin themes.
|
||||||
|
#
|
||||||
|
html_theme = 'alabaster'
|
||||||
|
|
||||||
|
# Add any paths that contain custom static files (such as style sheets) here,
|
||||||
|
# relative to this directory. They are copied after the builtin static files,
|
||||||
|
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||||
|
html_static_path = ['_static']
|
||||||
37
docs/source/index.rst
Normal file
37
docs/source/index.rst
Normal file
|
|
@ -0,0 +1,37 @@
|
||||||
|
.. ai-privacy-toolkit documentation master file, created by
|
||||||
|
sphinx-quickstart on Mon Feb 15 12:42:20 2021.
|
||||||
|
You can adapt this file completely to your liking, but it should at least
|
||||||
|
contain the root `toctree` directive.
|
||||||
|
|
||||||
|
Welcome to ai-privacy-toolkit's documentation!
|
||||||
|
==============================================
|
||||||
|
|
||||||
|
This project provides tools for assessing and improving the privacy and compliance of AI models.
|
||||||
|
|
||||||
|
The first release of this toolkit contains a single module called anonymization. This
|
||||||
|
module contains methods for anonymizing ML model training data, so that when
|
||||||
|
a model is retrained on the anonymized data, the model itself will also be considered
|
||||||
|
anonymous. This may help exempt the model from different obligations and restrictions
|
||||||
|
set out in data protection regulations such as GDPR, CCPA, etc.
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 2
|
||||||
|
:caption: Getting Started:
|
||||||
|
|
||||||
|
quick_start
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 2
|
||||||
|
:hidden:
|
||||||
|
:caption: API
|
||||||
|
|
||||||
|
apt
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Indices and tables
|
||||||
|
==================
|
||||||
|
|
||||||
|
* :ref:`genindex`
|
||||||
|
* :ref:`modindex`
|
||||||
|
* :ref:`search`
|
||||||
8
docs/source/modules.rst
Normal file
8
docs/source/modules.rst
Normal file
|
|
@ -0,0 +1,8 @@
|
||||||
|
ai-privacy-toolkit
|
||||||
|
==================
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 4
|
||||||
|
|
||||||
|
apt
|
||||||
|
tests
|
||||||
16
docs/source/quick_start.rst
Normal file
16
docs/source/quick_start.rst
Normal file
|
|
@ -0,0 +1,16 @@
|
||||||
|
###########################################
|
||||||
|
Getting started with the AI Privacy toolkit
|
||||||
|
###########################################
|
||||||
|
|
||||||
|
|
||||||
|
Download the toolkit code:
|
||||||
|
==========================
|
||||||
|
|
||||||
|
Clone the ``ai-privacy-toolkit`` repository::
|
||||||
|
|
||||||
|
$ git clone https://github.com/IBM/ai-privacy-toolkit.git
|
||||||
|
|
||||||
|
Or download using pip::
|
||||||
|
|
||||||
|
pip install ai-privacy-toolkit==0.0.1
|
||||||
|
|
||||||
30
docs/source/tests.rst
Normal file
30
docs/source/tests.rst
Normal file
|
|
@ -0,0 +1,30 @@
|
||||||
|
tests package
|
||||||
|
=============
|
||||||
|
|
||||||
|
Submodules
|
||||||
|
----------
|
||||||
|
|
||||||
|
tests.test\_anonymizer module
|
||||||
|
-----------------------------
|
||||||
|
|
||||||
|
.. automodule:: tests.test_anonymizer
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
tests.utils module
|
||||||
|
------------------
|
||||||
|
|
||||||
|
.. automodule:: tests.utils
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
|
||||||
|
Module contents
|
||||||
|
---------------
|
||||||
|
|
||||||
|
.. automodule:: tests
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
1165
notebooks/attribute_inference_anonymization_nursery.ipynb
Normal file
1165
notebooks/attribute_inference_anonymization_nursery.ipynb
Normal file
File diff suppressed because it is too large
Load diff
422
notebooks/membership_inference_anonymization_adult.ipynb
Normal file
422
notebooks/membership_inference_anonymization_adult.ipynb
Normal file
|
|
@ -0,0 +1,422 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Using ML anonymization to defend against membership inference attacks"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"In this tutorial we will show how to anonymize models using the ML anonymization module. \n",
|
||||||
|
"\n",
|
||||||
|
"We will demonstrate running inference attacks both on a vanilla model, and then on an anonymized version of the model. We will run a black-box membership inference attack using ART's inference module (https://github.com/Trusted-AI/adversarial-robustness-toolbox/tree/main/art/attacks/inference). \n",
|
||||||
|
"\n",
|
||||||
|
"This will be demonstrated using the Adult dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/datasets/adult). \n",
|
||||||
|
"\n",
|
||||||
|
"For simplicity, we used only the numerical features in the dataset."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Load data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 97,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[[ 39. 13. 2174. 0. 40.]\n",
|
||||||
|
" [ 50. 13. 0. 0. 13.]\n",
|
||||||
|
" [ 38. 9. 0. 0. 40.]\n",
|
||||||
|
" ...\n",
|
||||||
|
" [ 27. 13. 0. 0. 40.]\n",
|
||||||
|
" [ 26. 11. 0. 0. 48.]\n",
|
||||||
|
" [ 27. 9. 0. 0. 40.]]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"\n",
|
||||||
|
"# Use only numeric features (age, education-num, capital-gain, capital-loss, hours-per-week)\n",
|
||||||
|
"x_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
|
||||||
|
" usecols=(0, 4, 10, 11, 12), delimiter=\", \")\n",
|
||||||
|
"\n",
|
||||||
|
"y_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
|
||||||
|
" usecols=14, dtype=str, delimiter=\", \")\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"x_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
|
||||||
|
" usecols=(0, 4, 10, 11, 12), delimiter=\", \", skiprows=1)\n",
|
||||||
|
"\n",
|
||||||
|
"y_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
|
||||||
|
" usecols=14, dtype=str, delimiter=\", \", skiprows=1)\n",
|
||||||
|
"\n",
|
||||||
|
"# Trim trailing period \".\" from label\n",
|
||||||
|
"y_test = np.array([a[:-1] for a in y_test])\n",
|
||||||
|
"\n",
|
||||||
|
"y_train[y_train == '<=50K'] = 0\n",
|
||||||
|
"y_train[y_train == '>50K'] = 1\n",
|
||||||
|
"y_train = y_train.astype(np.int)\n",
|
||||||
|
"\n",
|
||||||
|
"y_test[y_test == '<=50K'] = 0\n",
|
||||||
|
"y_test[y_test == '>50K'] = 1\n",
|
||||||
|
"y_test = y_test.astype(np.int)\n",
|
||||||
|
"\n",
|
||||||
|
"# get balanced dataset\n",
|
||||||
|
"x_train = x_train[:x_test.shape[0]]\n",
|
||||||
|
"y_train = y_train[:y_test.shape[0]]\n",
|
||||||
|
"\n",
|
||||||
|
"print(x_train)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Train decision tree model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 116,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Base model accuracy: 0.8075056814691972\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.tree import DecisionTreeClassifier\n",
|
||||||
|
"from art.estimators.classification.scikitlearn import ScikitlearnDecisionTreeClassifier\n",
|
||||||
|
"\n",
|
||||||
|
"model = DecisionTreeClassifier()\n",
|
||||||
|
"model.fit(x_train, y_train)\n",
|
||||||
|
"\n",
|
||||||
|
"art_classifier = ScikitlearnDecisionTreeClassifier(model)\n",
|
||||||
|
"\n",
|
||||||
|
"print('Base model accuracy: ', model.score(x_test, y_test))\n",
|
||||||
|
"\n",
|
||||||
|
"x_train_predictions = np.array([np.argmax(arr) for arr in art_classifier.predict(x_train)]).reshape(-1,1)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Attack\n",
|
||||||
|
"The black-box attack basically trains an additional classifier (called the attack model) to predict the membership status of a sample.\n",
|
||||||
|
"#### Train attack model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 124,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from art.attacks.inference.membership_inference import MembershipInferenceBlackBox\n",
|
||||||
|
"\n",
|
||||||
|
"# attack_model_type can be nn (neural network), rf (random forest) or gb (gradient boosting)\n",
|
||||||
|
"bb_attack = MembershipInferenceBlackBox(art_classifier, attack_model_type='rf')\n",
|
||||||
|
"\n",
|
||||||
|
"# use half of each dataset for training the attack\n",
|
||||||
|
"attack_train_ratio = 0.5\n",
|
||||||
|
"attack_train_size = int(len(x_train) * attack_train_ratio)\n",
|
||||||
|
"attack_test_size = int(len(x_test) * attack_train_ratio)\n",
|
||||||
|
"\n",
|
||||||
|
"# train attack model\n",
|
||||||
|
"bb_attack.fit(x_train[:attack_train_size], y_train[:attack_train_size],\n",
|
||||||
|
" x_test[:attack_test_size], y_test[:attack_test_size])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### Infer membership and check accuracy"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 125,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"0.5440363591696352\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# get inferred values for remaining half\n",
|
||||||
|
"inferred_train_bb = bb_attack.infer(x_train[attack_train_size:], y_train[attack_train_size:])\n",
|
||||||
|
"inferred_test_bb = bb_attack.infer(x_test[attack_test_size:], y_test[attack_test_size:])\n",
|
||||||
|
"# check accuracy\n",
|
||||||
|
"train_acc = np.sum(inferred_train_bb) / len(inferred_train_bb)\n",
|
||||||
|
"test_acc = 1 - (np.sum(inferred_test_bb) / len(inferred_test_bb))\n",
|
||||||
|
"acc = (train_acc * len(inferred_train_bb) + test_acc * len(inferred_test_bb)) / (len(inferred_train_bb) + len(inferred_test_bb))\n",
|
||||||
|
"print(acc)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This means that for 54% of the data, membership is inferred correctly using this attack."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Anonymized data\n",
|
||||||
|
"## k=100\n",
|
||||||
|
"\n",
|
||||||
|
"Now we will apply the same attacks on an anonymized version of the same dataset (k=100). The data is anonymized on the quasi-identifiers: age, education-num, capital-gain, hours-per-week.\n",
|
||||||
|
"\n",
|
||||||
|
"k=100 means that each record in the anonymized dataset is identical to at least 99 others on the quasi-identifier values (i.e., when looking only at those features, the records are indistinguishable)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 128,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[[38. 13. 0. 0. 40.]\n",
|
||||||
|
" [57. 13. 0. 0. 30.]\n",
|
||||||
|
" [37. 9. 0. 0. 40.]\n",
|
||||||
|
" ...\n",
|
||||||
|
" [26. 13. 0. 0. 40.]\n",
|
||||||
|
" [29. 10. 0. 0. 50.]\n",
|
||||||
|
" [25. 9. 0. 0. 40.]]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import sys\n",
|
||||||
|
"sys.path.insert(0, os.path.abspath('..'))\n",
|
||||||
|
"from apt.anonymization import Anonymize\n",
|
||||||
|
"\n",
|
||||||
|
"# QI = (age, education-num, capital-gain, hours-per-week)\n",
|
||||||
|
"QI = [0, 1, 2, 4]\n",
|
||||||
|
"anonymizer = Anonymize(100, QI)\n",
|
||||||
|
"anon = anonymizer.anonymize(x_train, x_train_predictions)\n",
|
||||||
|
"print(anon)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 104,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"6739"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 104,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# number of distinct rows in original data\n",
|
||||||
|
"len(np.unique(x_train, axis=0))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 129,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"658"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 129,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# number of distinct rows in anonymized data\n",
|
||||||
|
"len(np.unique(anon, axis=0))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Train decision tree model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 130,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Anonymized model accuracy: 0.8304158221239482\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"anon_model = DecisionTreeClassifier()\n",
|
||||||
|
"anon_model.fit(anon, y_train)\n",
|
||||||
|
"\n",
|
||||||
|
"anon_art_classifier = ScikitlearnDecisionTreeClassifier(anon_model)\n",
|
||||||
|
"\n",
|
||||||
|
"print('Anonymized model accuracy: ', anon_model.score(x_test, y_test))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Attack\n",
|
||||||
|
"### Black-box attack"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 131,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"0.5034393809114359\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"anon_bb_attack = MembershipInferenceBlackBox(anon_art_classifier, attack_model_type='rf')\n",
|
||||||
|
"\n",
|
||||||
|
"# train attack model\n",
|
||||||
|
"anon_bb_attack.fit(x_train[:attack_train_size], y_train[:attack_train_size],\n",
|
||||||
|
" x_test[:attack_test_size], y_test[:attack_test_size])\n",
|
||||||
|
"\n",
|
||||||
|
"# get inferred values\n",
|
||||||
|
"anon_inferred_train_bb = anon_bb_attack.infer(x_train[attack_train_size:], y_train[attack_train_size:])\n",
|
||||||
|
"anon_inferred_test_bb = anon_bb_attack.infer(x_test[attack_test_size:], y_test[attack_test_size:])\n",
|
||||||
|
"# check accuracy\n",
|
||||||
|
"anon_train_acc = np.sum(anon_inferred_train_bb) / len(anon_inferred_train_bb)\n",
|
||||||
|
"anon_test_acc = 1 - (np.sum(anon_inferred_test_bb) / len(anon_inferred_test_bb))\n",
|
||||||
|
"anon_acc = (anon_train_acc * len(anon_inferred_train_bb) + anon_test_acc * len(anon_inferred_test_bb)) / (len(anon_inferred_train_bb) + len(anon_inferred_test_bb))\n",
|
||||||
|
"print(anon_acc)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Attack accuracy is reduced to 50% (equivalent to random guessing)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 132,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"(0.5298924372550654, 0.7806166318634075)\n",
|
||||||
|
"(0.5030507735890172, 0.5671293452892765)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"def calc_precision_recall(predicted, actual, positive_value=1):\n",
|
||||||
|
" score = 0 # both predicted and actual are positive\n",
|
||||||
|
" num_positive_predicted = 0 # predicted positive\n",
|
||||||
|
" num_positive_actual = 0 # actual positive\n",
|
||||||
|
" for i in range(len(predicted)):\n",
|
||||||
|
" if predicted[i] == positive_value:\n",
|
||||||
|
" num_positive_predicted += 1\n",
|
||||||
|
" if actual[i] == positive_value:\n",
|
||||||
|
" num_positive_actual += 1\n",
|
||||||
|
" if predicted[i] == actual[i]:\n",
|
||||||
|
" if predicted[i] == positive_value:\n",
|
||||||
|
" score += 1\n",
|
||||||
|
" \n",
|
||||||
|
" if num_positive_predicted == 0:\n",
|
||||||
|
" precision = 1\n",
|
||||||
|
" else:\n",
|
||||||
|
" precision = score / num_positive_predicted # the fraction of predicted “Yes” responses that are correct\n",
|
||||||
|
" if num_positive_actual == 0:\n",
|
||||||
|
" recall = 1\n",
|
||||||
|
" else:\n",
|
||||||
|
" recall = score / num_positive_actual # the fraction of “Yes” responses that are predicted correctly\n",
|
||||||
|
"\n",
|
||||||
|
" return precision, recall\n",
|
||||||
|
"\n",
|
||||||
|
"# regular\n",
|
||||||
|
"print(calc_precision_recall(np.concatenate((inferred_train_bb, inferred_test_bb)), \n",
|
||||||
|
" np.concatenate((np.ones(len(inferred_train_bb)), np.zeros(len(inferred_test_bb))))))\n",
|
||||||
|
"# anon\n",
|
||||||
|
"print(calc_precision_recall(np.concatenate((anon_inferred_train_bb, anon_inferred_test_bb)), \n",
|
||||||
|
" np.concatenate((np.ones(len(anon_inferred_train_bb)), np.zeros(len(anon_inferred_test_bb))))))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Precision and recall are also reduced."
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.8.3"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
7
requirements.txt
Normal file
7
requirements.txt
Normal file
|
|
@ -0,0 +1,7 @@
|
||||||
|
numpy==1.19.0
|
||||||
|
pandas==1.1.0
|
||||||
|
scipy==1.4.1
|
||||||
|
scikit-learn==0.22.2
|
||||||
|
|
||||||
|
# testing
|
||||||
|
pytest==5.4.2
|
||||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
79
tests/test_anonymizer.py
Normal file
79
tests/test_anonymizer.py
Normal file
|
|
@ -0,0 +1,79 @@
|
||||||
|
import pytest
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.tree import DecisionTreeClassifier
|
||||||
|
from sklearn.preprocessing import OneHotEncoder
|
||||||
|
|
||||||
|
from apt.anonymization import Anonymize
|
||||||
|
from apt.utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset
|
||||||
|
|
||||||
|
|
||||||
|
def test_anonymize_ndarray_iris():
    """Check k-anonymization of the iris training data given as a numpy array.

    A decision tree is fitted on the original training data and its
    predictions on that same data are handed to the anonymizer together
    with the data. The test then verifies three properties of the result:

    * the anonymized array has fewer distinct rows than the original,
    * every distinct combination of quasi-identifier (QI) values occurs
      at least ``k`` times,
    * columns that are not quasi-identifiers are left untouched.
    """
    (x_train, y_train), _ = get_iris_dataset()
    model = DecisionTreeClassifier()
    model.fit(x_train, y_train)
    pred = model.predict(x_train)

    k = 10
    QI = [0, 2]
    anonymizer = Anonymize(k, QI)
    anon = anonymizer.anonymize(x_train, pred)

    # Anonymization should collapse records, leaving fewer distinct rows.
    assert len(np.unique(anon, axis=0)) < len(np.unique(x_train, axis=0))

    # axis=0 makes numpy count distinct QI *rows* (value combinations).
    # Without it the sub-array is flattened and individual scalar values are
    # counted instead, which does not check k-anonymity of the combinations.
    _, counts_elements = np.unique(anon[:, QI], axis=0, return_counts=True)
    assert np.min(counts_elements) >= k

    # Only the QI columns may change.
    assert (np.delete(anon, QI, axis=1) == np.delete(x_train, QI, axis=1)).all()
|
||||||
|
|
||||||
|
|
||||||
|
def test_anonymize_pandas_adult():
    """Check k-anonymization (k=100) of the adult training data.

    The data is accessed through the pandas-style API (``drop_duplicates``,
    ``loc``, ``drop``). A decision tree is fitted on a one-hot encoding of
    the data; its predictions and the un-encoded data are passed to the
    anonymizer. The result must have fewer distinct rows, contain every
    quasi-identifier combination at least ``k`` times, and leave the
    non-quasi-identifier columns equal to the original.
    """
    (x_train, y_train), _ = get_adult_dataset()

    # The tree is trained on a one-hot encoded view of the features.
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(x_train)
    tree = DecisionTreeClassifier()
    tree.fit(encoded, y_train)
    predictions = tree.predict(encoded)

    k = 100
    categorical_features = [
        'workclass',
        'marital-status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'native-country',
    ]
    QI = [
        'age',
        'workclass',
        'education-num',
        'marital-status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'native-country',
    ]
    anonymizer = Anonymize(k, QI, categorical_features=categorical_features)
    anon = anonymizer.anonymize(x_train, predictions)

    # Fewer distinct records after anonymization.
    distinct_after = anon.drop_duplicates().shape[0]
    distinct_before = x_train.drop_duplicates().shape[0]
    assert distinct_after < distinct_before

    # Smallest group of identical QI combinations must hold at least k rows.
    smallest_group = anon.loc[:, QI].value_counts().min()
    assert smallest_group >= k

    # Columns outside the QI set must be unchanged.
    untouched_after = anon.drop(QI, axis=1)
    untouched_before = x_train.drop(QI, axis=1)
    assert untouched_after.equals(untouched_before)
|
||||||
|
|
||||||
|
|
||||||
|
def test_anonymize_pandas_nursery():
    """Check k-anonymization (k=100) of the nursery training data.

    All feature values are cast to ``str`` before use. A decision tree is
    fitted on a one-hot encoding of the data; its predictions and the
    un-encoded data are passed to the anonymizer. The result must have
    fewer distinct rows, contain every quasi-identifier combination at
    least ``k`` times, and leave the non-quasi-identifier columns equal to
    the original.
    """
    (x_train, y_train), _ = get_nursery_dataset()
    x_train = x_train.astype(str)

    # The tree is trained on a one-hot encoded view of the features.
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(x_train)
    tree = DecisionTreeClassifier()
    tree.fit(encoded, y_train)
    predictions = tree.predict(encoded)

    k = 100
    QI = ["finance", "social", "health"]
    categorical_features = [
        "parents",
        "has_nurs",
        "form",
        "housing",
        "finance",
        "social",
        "health",
        'children',
    ]
    anonymizer = Anonymize(k, QI, categorical_features=categorical_features)
    anon = anonymizer.anonymize(x_train, predictions)

    # Fewer distinct records after anonymization.
    distinct_after = anon.drop_duplicates().shape[0]
    distinct_before = x_train.drop_duplicates().shape[0]
    assert distinct_after < distinct_before

    # Smallest group of identical QI combinations must hold at least k rows.
    smallest_group = anon.loc[:, QI].value_counts().min()
    assert smallest_group >= k

    # Columns outside the QI set must be unchanged.
    untouched_after = anon.drop(QI, axis=1)
    untouched_before = x_train.drop(QI, axis=1)
    assert untouched_after.equals(untouched_before)
|
||||||
|
|
||||||
|
|
||||||
|
def test_errors():
    """Check that invalid arguments are rejected with ``ValueError``.

    Covers three constructor calls (``k=1``, an empty quasi-identifier
    list, and ``None`` as the quasi-identifier list) and two ``anonymize``
    calls on an anonymizer built with ``k=10`` and quasi-identifiers
    ``[0, 2]``.
    """
    # Constructor arguments expected to be rejected, tried in order.
    invalid_constructor_args = (
        (1, [0, 2]),
        (2, []),
        (2, None),
    )
    for bad_k, bad_qi in invalid_constructor_args:
        with pytest.raises(ValueError):
            Anonymize(bad_k, bad_qi)

    anonymizer = Anonymize(10, [0, 2])

    # Training features paired with the *test* labels of the iris data.
    # NOTE(review): presumably rejected because the two inputs differ in
    # length -- confirm against the anonymizer's validation.
    (iris_features, _), (_, iris_test_labels) = get_iris_dataset()
    with pytest.raises(ValueError):
        anonymizer.anonymize(iris_features, iris_test_labels)

    # Adult data fed to an anonymizer configured with the QI list [0, 2].
    # NOTE(review): the exact reason for the expected failure is not visible
    # here -- confirm against the anonymizer's validation.
    (adult_features, adult_labels), _ = get_adult_dataset()
    with pytest.raises(ValueError):
        anonymizer.anonymize(adult_features, adult_labels)
|
||||||
Loading…
Add table
Add a link
Reference in a new issue