Initial commit

This commit is contained in:
abigailt 2021-04-28 14:00:19 +03:00
parent d2de0726f4
commit 5665c2e79d
22 changed files with 2369 additions and 0 deletions

4
apt/__init__.py Normal file
View file

@ -0,0 +1,4 @@
# Import the package's submodules so that `apt.anonymization` and `apt.utils` are bound on `import apt`.
from apt import anonymization
from apt import utils
# Version string of the package.
__version__ = "0.0.1"

View file

@ -0,0 +1,22 @@
# anonymization module
This module contains methods for anonymizing ML model training data, so that when
a model is retrained on the anonymized data, the model itself will also be considered
anonymous. This may help exempt the model from different obligations and restrictions
set out in data protection regulations such as GDPR, CCPA, etc.
The module contains methods that enable anonymizing training datasets in a manner that
is tailored to and guided by an existing, trained ML model. It uses the existing model's
predictions on the training data to train a second model (the anonymizer), which then determines
the generalizations that will be applied to the training data. For more information about the
method, see: https://arxiv.org/abs/2007.13086
Once the anonymized training data is returned, it can be used to retrain the model.
The following figure depicts the overall process:
<p align="center">
<img src="../../docs/images/AI_Privacy_project2.jpg?raw=true" width="667" title="anonymization process">
</p>
<br />

View file

@ -0,0 +1,17 @@
"""
Module providing ML anonymization.
This module contains methods for anonymizing ML model training data, so that when
a model is retrained on the anonymized data, the model itself will also be considered
anonymous. This may help exempt the model from different obligations and restrictions
set out in data protection regulations such as GDPR, CCPA, etc.
The module contains methods that enable anonymizing training datasets in a manner that
is tailored to and guided by an existing, trained ML model. It uses the existing model's
predictions on the training data to train a second model (the anonymizer), which then determines
the generalizations that will be applied to the training data. For more information about the
method, see: https://arxiv.org/abs/2007.13086
Once the anonymized training data is returned, it can be used to retrain the model.
"""
from apt.anonymization.anonymizer import Anonymize

View file

@ -0,0 +1,165 @@
import numpy as np
import pandas as pd
from scipy.spatial import distance
from collections import Counter
from sklearn.tree import DecisionTreeClassifier
from typing import Union, Optional
class Anonymize:
    """
    Class for performing tailored, model-guided anonymization of training datasets for ML models.

    A decision tree (the "anonymizer") is trained on the quasi-identifier features using the
    original model's predictions as labels, with at least `k` samples per leaf. Every record then
    has its quasi-identifier values replaced by the representative values of the leaf it falls into.

    Based on the implementation described in: https://arxiv.org/abs/2007.13086
    """

    # Value found in `tree_.feature` for nodes that do not split on a feature (i.e. leaves).
    _LEAF_FEATURE = -2

    def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list]=None):
        """
        :param k: The privacy parameter that determines the number of records that will be indistinguishable from each
                  other (when looking at the quasi identifiers). Should be at least 2.
        :param quasi_identifiers: The indexes of the features that need to be anonymized (these should be the features
                                  that may directly, indirectly or in combination with additional data, identify an
                                  individual).
        :param categorical_features: The list of categorical features (should only be supplied when passing data as a
                                     pandas dataframe).
        :raises ValueError: If `k` is smaller than 2 or no quasi-identifiers are supplied.
        """
        if k < 2:
            raise ValueError("k should be a positive integer with a value of 2 or higher")
        # Explicit None/length test: the truth value of a numpy array is ambiguous for more than
        # one element, and a single-element array such as [0] would be treated as "empty".
        if quasi_identifiers is None or len(quasi_identifiers) < 1:
            raise ValueError("The list of quasi-identifiers cannot be empty")
        self.k = k
        self.quasi_identifiers = quasi_identifiers
        self.categorical_features = categorical_features

    def anonymize(self, x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame]) \
            -> Union[np.ndarray, pd.DataFrame]:
        """
        Method for performing model-guided anonymization.

        :param x: The training data for the model. If provided as a pandas dataframe, may contain both numeric and
                  categorical data.
        :param y: The predictions of the original model on the training data.
        :return: An array containing the anonymized training dataset (a modified copy; `x` itself is not changed).
        :raises ValueError: If `x` and `y` differ in length, or if `x` is a dataframe and no categorical features
                            were defined.
        """
        if isinstance(x, np.ndarray):
            return self._anonymize_ndarray(x.copy(), y)
        # anything else is treated as a pandas dataframe
        if not self.categorical_features:
            raise ValueError('When supplying a pandas dataframe, categorical_features must be defined')
        return self._anonymize_pandas(x.copy(), y)

    def _fit_anonymizer(self, features, y):
        # min_samples_leaf=k guarantees that every leaf (cell) holds at least k training records.
        self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
        self.anonymizer.fit(features, y)

    def _anonymize_ndarray(self, x, y):
        if len(x) != len(y):
            raise ValueError("x and y should have same number of rows")
        x_anonymizer_train = x[:, self.quasi_identifiers]
        self._fit_anonymizer(x_anonymizer_train, y)
        cells_by_id = self._calculate_cells(x, x_anonymizer_train)
        return self._anonymize_data_numpy(x, x_anonymizer_train, cells_by_id)

    def _anonymize_pandas(self, x, y):
        if len(x) != len(y):
            raise ValueError("x and y should have same number of rows")
        x_anonymizer_train = x.loc[:, self.quasi_identifiers]
        # need to one-hot encode before training the decision tree
        x_prepared = self._modify_categorical_features(x_anonymizer_train)
        self._fit_anonymizer(x_prepared, y)
        cells_by_id = self._calculate_cells(x, x_prepared)
        return self._anonymize_data_pandas(x, x_prepared, cells_by_id)

    def _calculate_cells(self, x, x_anonymizer_train):
        # x is original data, x_anonymizer_train is only QIs + 1-hot encoded
        tree = self.anonymizer.tree_
        leaves = [node for node, feature in enumerate(tree.feature) if feature == self._LEAF_FEATURE]
        cells_by_id = {}
        for node in leaves:
            label_hist = tree.value[node][0]
            # NOTE(review): assumes tree_.value holds per-class sample counts; verify this for the
            # scikit-learn version in use, since int() truncates fractional values.
            hist = [int(count) for count in label_hist]
            label = int(self.anonymizer.classes_[np.argmax(label_hist)])
            cells_by_id[int(node)] = {'label': label, 'hist': hist, 'id': int(node)}
        self.nodes = leaves
        self._find_representatives(x, x_anonymizer_train, cells_by_id.values())
        return cells_by_id

    def _find_representatives(self, x, x_anonymizer_train, cells):
        # x is original data, x_anonymizer_train is only QIs + 1-hot encoded
        node_ids = self._find_sample_nodes(x_anonymizer_train)
        # Group row positions by leaf in a single pass instead of rescanning all samples per cell.
        positions_by_node = {}
        for position, node_id in enumerate(node_ids):
            positions_by_node.setdefault(node_id, []).append(position)

        is_ndarray = isinstance(x, np.ndarray)
        for cell in cells:
            # get all rows in cell
            indexes = positions_by_node.get(cell['id'], [])
            # TODO: should we filter only those with majority label? (using hist)
            rows = x[indexes] if is_ndarray else x.iloc[indexes]
            representative = {}
            for feature in self.quasi_identifiers:
                values = rows[:, feature] if is_ndarray else rows.loc[:, feature]
                if self.categorical_features and feature in self.categorical_features:
                    # find most common value
                    representative[feature] = Counter(values).most_common(1)[0][0]
                else:
                    # pick an actual value from the cell: the one closest to the median
                    representative[feature] = self._closest_to_median(values)
            cell['representative'] = representative

    @staticmethod
    def _closest_to_median(values):
        # Return the first element of `values` with the smallest absolute distance to the median.
        candidates = np.asarray(values)
        distances = np.abs(candidates - np.median(candidates))
        return candidates[int(np.argmin(distances))]

    def _find_sample_nodes(self, samples):
        # apply() returns, for each sample, the index of the leaf it ends up in.
        return [int(node_id) for node_id in self.anonymizer.apply(samples)]

    def _find_sample_cells(self, samples, cells_by_id):
        node_ids = self._find_sample_nodes(samples)
        return [cells_by_id[node_id] for node_id in node_ids]

    def _anonymize_data_numpy(self, x, x_anonymizer_train, cells_by_id):
        cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
        # Iterating a 2-D array yields row views, so the assignment below writes into x.
        for row, cell in zip(x, cells):
            for feature, value in cell['representative'].items():
                row[feature] = value
        return x

    def _anonymize_data_pandas(self, x, x_anonymizer_train, cells_by_id):
        cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
        # Write by position so that duplicate index labels cannot make one assignment hit several rows.
        column_positions = {feature: x.columns.get_loc(feature) for feature in self.quasi_identifiers}
        for position, cell in enumerate(cells):
            for feature, value in cell['representative'].items():
                x.iat[position, column_positions[feature]] = value
        return x

    def _modify_categorical_features(self, x):  # only for pandas
        # One-hot encodes the categorical quasi-identifiers and records, for each of them, the
        # list of observed values and the mapping from one-hot column name back to the feature.
        self.categorical_values = {}
        self.one_hot_to_features = {}
        # Work on a copy: x is a column selection of the caller's frame and must not be written through.
        x = x.copy()
        features_to_remove = []
        for feature in self.categorical_features:
            if feature not in self.quasi_identifiers:
                continue
            values = list(x.loc[:, feature].unique())
            self.categorical_values[feature] = values
            x[feature] = pd.Categorical(x.loc[:, feature], categories=values, ordered=False)
            one_hot_vector = pd.get_dummies(x[feature], prefix=feature)
            for one_hot_vector_feature in one_hot_vector.columns:
                self.one_hot_to_features[one_hot_vector_feature] = feature
            x = pd.concat([x, one_hot_vector], axis=1)
            features_to_remove.append(feature)
        return x.drop(features_to_remove, axis=1)

219
apt/utils.py Normal file
View file

@ -0,0 +1,219 @@
from sklearn import datasets, model_selection
import sklearn.preprocessing
import pandas as pd
import ssl
from os import path
from six.moves.urllib.request import urlretrieve
def _load_iris(test_set_size: float=0.3):
    """
    Load scikit-learn's Iris data and split it into a training part and a test part.

    :param test_set_size: Value passed to `train_test_split` as `test_size`.
    :return: `(x_train, y_train), (x_test, y_test)`.
    """
    bunch = datasets.load_iris()
    # Stratified, shuffled split with a fixed seed
    parts = model_selection.train_test_split(
        bunch.data,
        bunch.target,
        test_size=test_set_size,
        random_state=18,
        stratify=bunch.target,
        shuffle=True,
    )
    train_x, test_x, train_y, test_y = parts
    return (train_x, train_y), (test_x, test_y)
def get_iris_dataset(test_set: float = 0.3):
    """
    Loads the Iris dataset from scikit-learn.

    :param test_set: Proportion of the data to use as validation split (value between 0 and 1).
    :return: Two tuples, `(x_train, y_train), (x_test, y_test)`, as produced by `_load_iris`.
    """
    return _load_iris(test_set)
def get_adult_dataset():
    """
    Loads the UCI Adult dataset from `../datasets/adult` (relative to the current working directory),
    downloading the train and test files first if they are not present.

    :return: Two tuples, `(x_train, y_train), (x_test, y_test)`. The features are pandas dataframes and the
             labels are the `label` column selected from them.
    """
    import os

    features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
                'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
                'label']
    train_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
    test_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
    train_file = '../datasets/adult/train'
    test_file = '../datasets/adult/test'

    missing = [(url, file) for url, file in ((train_url, train_file), (test_url, test_file))
               if not path.exists(file)]
    if missing:
        # NOTE(review): certificate verification is disabled for these downloads, as in the original
        # code. The override is now limited to the download itself and undone afterwards instead of
        # being left in place for the whole process. Consider removing it altogether.
        default_context = ssl._create_default_https_context
        ssl._create_default_https_context = ssl._create_unverified_context
        try:
            for url, file in missing:
                # urlretrieve does not create the target directory
                os.makedirs(path.dirname(file), exist_ok=True)
                urlretrieve(url, file)
        finally:
            ssl._create_default_https_context = default_context

    train = pd.read_csv(train_file, sep=', ', names=features, engine='python')
    test = pd.read_csv(test_file, sep=', ', names=features, engine='python')
    # the first row read from the test file is discarded
    test = test.iloc[1:]

    train = _modify_adult_dataset(train)
    test = _modify_adult_dataset(test)

    x_train = train.drop(['label'], axis=1)
    y_train = train.loc[:, 'label']
    x_test = test.drop(['label'], axis=1)
    y_test = test.loc[:, 'label']
    return (x_train, y_train), (x_test, y_test)
def _modify_adult_dataset(data):
def modify_label(value):
if value == '<=50K.' or value == '<=50K':
return 0
elif value == '>50K.' or value == '>50K':
return 1
else:
raise Exception('Bad label value')
def modify_native_country(value):
Euro_1 = ['Italy', 'Holand-Netherlands', 'Germany', 'France']
Euro_2 = ['Yugoslavia', 'South', 'Portugal', 'Poland', 'Hungary', 'Greece']
SE_Asia = ['Vietnam', 'Thailand', 'Philippines', 'Laos', 'Cambodia']
UnitedStates = ['United-States']
LatinAmerica = ['Trinadad&Tobago', 'Puerto-Rico', 'Outlying-US(Guam-USVI-etc)', 'Nicaragua', 'Mexico',
'Jamaica', 'Honduras', 'Haiti', 'Guatemala', 'Dominican-Republic']
China = ['Taiwan', 'Hong', 'China']
BritishCommonwealth = ['Scotland', 'Ireland', 'India', 'England', 'Canada']
SouthAmerica = ['Peru', 'El-Salvador', 'Ecuador', 'Columbia']
Other = ['Japan', 'Iran', 'Cuba']
if value in Euro_1:
return 'Euro_1'
elif value in Euro_2:
return 'Euro_2'
elif value in SE_Asia:
return 'SE_Asia'
elif value in UnitedStates:
return 'UnitedStates'
elif value in LatinAmerica:
return 'LatinAmerica'
elif value in China:
return 'China'
elif value in BritishCommonwealth:
return 'BritishCommonwealth'
elif value in SouthAmerica:
return 'SouthAmerica'
elif value in Other:
return 'Other'
elif value == '?':
return 'Unknown'
else:
raise Exception('Bad native country value')
data['label'] = data['label'].apply(modify_label)
data['native-country'] = data['native-country'].apply(modify_native_country)
for col in ('age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'):
try:
data[col] = data[col].fillna(0)
except KeyError:
print('missing column ' + col)
for col in ('workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'):
try:
data[col] = data[col].fillna('NA')
except KeyError:
print('missing column ' + col)
return data.drop(['fnlwgt', 'education'], axis=1)
def get_nursery_dataset(raw: bool = True, test_set: float = 0.2, transform_social: bool = False):
    """
    Loads the UCI Nursery dataset from `../datasets/nursery` (relative to the current working directory),
    downloading it first if it is not present.

    :param raw: `True` if no preprocessing should be applied to the data. Otherwise, categorical data is one-hot
                encoded and data is scaled using sklearn's StandardScaler.
    :param test_set: Proportion of the data to use as validation split. The value should be between 0 and 1.
    :param transform_social: If `True`, transforms the social feature to be binary for the purpose of attribute
                             inference. This is done by assigning the original value 'problematic' the new value 1, and
                             the other original values are assigned the new value 0.
    :return: Two tuples, `(x_train, y_train), (x_test, y_test)`. The features are pandas dataframes and the
             labels are the `label` column selected from them.
    :raises ValueError: If an unexpected label value is found in the data.
    """
    import os

    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data'
    data_file = '../datasets/nursery/data'

    if not path.exists(data_file):
        # urlretrieve does not create the target directory
        os.makedirs(path.dirname(data_file), exist_ok=True)
        # NOTE(review): certificate verification is disabled for this download, as in the original
        # code. The override is now limited to the download itself and undone afterwards instead of
        # being left in place for the whole process. Consider removing it altogether.
        default_context = ssl._create_default_https_context
        ssl._create_default_https_context = ssl._create_unverified_context
        try:
            urlretrieve(url, data_file)
        finally:
            ssl._create_default_https_context = default_context

    # load data
    features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health", "label"]
    categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health"]
    data = pd.read_csv(data_file, sep=",", names=features, engine="python")

    # remove rows with missing label or too sparse label
    data = data.dropna(subset=["label"])
    data = data.loc[data["label"] != "recommend"].copy()

    # fill missing values
    data["children"] = data["children"].fillna(0)
    for col in categorical_features:
        data[col] = data[col].fillna("other")

    # make categorical label (4 classes remain after dropping "recommend")
    label_codes = {"not_recom": 0, "very_recom": 1, "priority": 2, "spec_prior": 3}

    def modify_label(value):
        try:
            return label_codes[value]
        except KeyError:
            raise ValueError("Bad label value: %s" % value) from None

    data["label"] = data["label"].apply(modify_label)

    # "more" becomes 4; convert the whole column to numbers so it does not end up holding a
    # mix of strings and ints
    data["children"] = pd.to_numeric(data["children"].apply(lambda x: 4 if x == "more" else x))

    if transform_social:
        data["social"] = (data["social"] == "problematic").astype(int)
        categorical_features.remove("social")

    if not raw:
        # one-hot-encode categorical features
        for feature in categorical_features:
            values = list(data.loc[:, feature].unique())
            data[feature] = pd.Categorical(data.loc[:, feature], categories=values, ordered=False)
            one_hot_vector = pd.get_dummies(data[feature], prefix=feature)
            data = pd.concat([data, one_hot_vector], axis=1)
        data = data.drop(categorical_features, axis=1)

        # normalize data
        label = data.loc[:, "label"]
        feature_frame = data.drop(["label"], axis=1)
        scaler = sklearn.preprocessing.StandardScaler()
        scaler.fit(feature_frame)
        # Keep the original index: rows were dropped above, so a fresh 0..n-1 index would pair
        # features with the wrong labels (and lose rows) in the index-aligned concat below.
        scaled_features = pd.DataFrame(scaler.transform(feature_frame), columns=feature_frame.columns,
                                       index=feature_frame.index)
        data = pd.concat([label, scaled_features], axis=1, join="inner")

    # Split training and test sets
    stratified = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1, test_size=test_set, random_state=18)
    train_positions, test_positions = next(stratified.split(data, data["label"]))
    train = data.iloc[train_positions]
    test = data.iloc[test_positions]

    x_train = train.drop(["label"], axis=1)
    y_train = train.loc[:, "label"]
    x_test = test.drop(["label"], axis=1)
    y_test = test.loc[:, "label"]
    return (x_train, y_train), (x_test, y_test)