mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-06-23 15:48:06 +02:00
Initial commit
This commit is contained in:
parent
d2de0726f4
commit
5665c2e79d
22 changed files with 2369 additions and 0 deletions
4
apt/__init__.py
Normal file
4
apt/__init__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
from apt import anonymization
|
||||
from apt import utils
|
||||
|
||||
__version__ = "0.0.1"
|
||||
22
apt/anonymization/README.md
Normal file
22
apt/anonymization/README.md
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
# anonymization module
|
||||
This module contains methods for anonymizing ML model training data, so that when
|
||||
a model is retrained on the anonymized data, the model itself will also be considered
|
||||
anonymous. This may help exempt the model from different obligations and restrictions
|
||||
set out in data protection regulations such as GDPR, CCPA, etc.
|
||||
|
||||
The module contains methods that enable anonymizing training datasets in a manner that
|
||||
is tailored to and guided by an existing, trained ML model. It uses the existing model's
|
||||
predictions on the training data to train a second (anonymizer) model, which eventually determines
|
||||
the generalizations that will be applied to the training data. For more information about the
|
||||
method see: https://arxiv.org/abs/2007.13086
|
||||
|
||||
Once the anonymized training data is returned, it can be used to retrain the model.
|
||||
|
||||
The following figure depicts the overall process:
|
||||
|
||||
<p align="center">
|
||||
<img src="../../docs/images/AI_Privacy_project2.jpg?raw=true" width="667" title="anonymization process">
|
||||
</p>
|
||||
<br />
|
||||
|
||||
|
||||
17
apt/anonymization/__init__.py
Normal file
17
apt/anonymization/__init__.py
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
"""
|
||||
Module providing ML anonymization.
|
||||
|
||||
This module contains methods for anonymizing ML model training data, so that when
|
||||
a model is retrained on the anonymized data, the model itself will also be considered
|
||||
anonymous. This may help exempt the model from different obligations and restrictions
|
||||
set out in data protection regulations such as GDPR, CCPA, etc.
|
||||
|
||||
The module contains methods that enable anonymizing training datasets in a manner that
|
||||
is tailored to and guided by an existing, trained ML model. It uses the existing model's
|
||||
predictions on the training data to train a second, anonymizer model, that eventually determines
|
||||
the generalizations that will be applied to the training data. For more information about the
|
||||
method see: https://arxiv.org/abs/2007.13086
|
||||
|
||||
Once the anonymized training data is returned, it can be used to retrain the model.
|
||||
"""
|
||||
from apt.anonymization.anonymizer import Anonymize
|
||||
165
apt/anonymization/anonymizer.py
Normal file
165
apt/anonymization/anonymizer.py
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
from scipy.spatial import distance
|
||||
from collections import Counter
|
||||
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
|
||||
from typing import Union, Optional
|
||||
|
||||
|
||||
class Anonymize:
    """
    Class for performing tailored, model-guided anonymization of training datasets for ML models.

    A decision tree (the "anonymizer") is trained on the quasi-identifier features using the original model's
    predictions as labels, with at least ``k`` samples per leaf. Every leaf defines a cell, and the
    quasi-identifier values of all records in a cell are replaced by a single representative value per feature.

    Based on the implementation described in: https://arxiv.org/abs/2007.13086
    """

    def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None):
        """
        :param k: The privacy parameter that determines the number of records that will be indistinguishable from each
                  other (when looking at the quasi identifiers). Should be at least 2.
        :param quasi_identifiers: The indexes of the features that need to be anonymized (these should be the features
                                  that may directly, indirectly or in combination with additional data, identify an
                                  individual).
        :param categorical_features: The list of categorical features (should only be supplied when passing data as a
                                     pandas dataframe).
        :raises ValueError: If ``k`` is smaller than 2 or no quasi-identifiers are given.
        """
        if k < 2:
            raise ValueError("k should be a positive integer with a value of 2 or higher")
        # Use an explicit None/len check: the truth value of a multi-element ndarray is ambiguous and raises.
        if quasi_identifiers is None or len(quasi_identifiers) < 1:
            raise ValueError("The list of quasi-identifiers cannot be empty")

        self.k = k
        self.quasi_identifiers = quasi_identifiers
        self.categorical_features = categorical_features

    def anonymize(self, x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame]) \
            -> Union[np.ndarray, pd.DataFrame]:
        """
        Method for performing model-guided anonymization.

        :param x: The training data for the model. If provided as a pandas dataframe, may contain both numeric and
                  categorical data.
        :param y: The predictions of the original model on the training data.
        :return: An array containing the anonymized training dataset. The input ``x`` is copied, not modified.
        :raises ValueError: If ``x`` is not a numpy array and no categorical features were supplied, or if ``x`` and
                            ``y`` have a different number of rows.
        """
        if isinstance(x, np.ndarray):
            return self._anonymize_ndarray(x.copy(), y)
        # anything else is treated as a pandas dataframe
        if not self.categorical_features:
            raise ValueError('When supplying a pandas dataframe, categorical_features must be defined')
        return self._anonymize_pandas(x.copy(), y)

    def _fit_anonymizer(self, features, y):
        """Train the anonymizer tree; ``min_samples_leaf=k`` guarantees at least k training records per leaf."""
        self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
        self.anonymizer.fit(features, y)

    def _anonymize_ndarray(self, x, y):
        """Anonymize a numpy array in place; quasi-identifiers are column indexes."""
        if len(x) != len(y):
            raise ValueError("x and y should have same number of rows")
        x_anonymizer_train = x[:, self.quasi_identifiers]
        self._fit_anonymizer(x_anonymizer_train, y)
        cells_by_id = self._calculate_cells(x, x_anonymizer_train)
        return self._anonymize_data_numpy(x, x_anonymizer_train, cells_by_id)

    def _anonymize_pandas(self, x, y):
        """Anonymize a pandas dataframe in place; quasi-identifiers are column labels."""
        if len(x) != len(y):
            raise ValueError("x and y should have same number of rows")
        x_anonymizer_train = x.loc[:, self.quasi_identifiers]
        # need to one-hot encode before training the decision tree
        x_prepared = self._modify_categorical_features(x_anonymizer_train)
        self._fit_anonymizer(x_prepared, y)
        cells_by_id = self._calculate_cells(x, x_prepared)
        return self._anonymize_data_pandas(x, x_prepared, cells_by_id)

    def _calculate_cells(self, x, x_anonymizer_train):
        """
        Build one cell per leaf of the anonymizer tree and compute its representative values.

        :param x: The original data.
        :param x_anonymizer_train: Only the quasi-identifiers of ``x`` (one-hot encoded for pandas input).
        :return: Dictionary mapping leaf node id to its cell (keys 'label', 'hist', 'id', 'representative').
        """
        tree = self.anonymizer.tree_
        cells_by_id = {}
        leaves = []
        for node, feature in enumerate(tree.feature):
            if feature == -2:  # leaf node
                leaves.append(node)
                label_hist = tree.value[node][0]
                cell = {
                    'label': int(self.anonymizer.classes_[np.argmax(label_hist)]),
                    'hist': [int(i) for i in label_hist],
                    'id': int(node),
                }
                cells_by_id[cell['id']] = cell
        self.nodes = leaves
        self._find_representatives(x, x_anonymizer_train, cells_by_id.values())
        return cells_by_id

    def _find_representatives(self, x, x_anonymizer_train, cells):
        """
        Fill ``cell['representative']`` for every cell with one value per quasi-identifier.

        Categorical features get the most common value in the cell; all other features get the actual value in the
        cell that is closest to the cell's median.

        :param x: The original data.
        :param x_anonymizer_train: Only the quasi-identifiers of ``x`` (one-hot encoded for pandas input).
        :param cells: Iterable of cell dictionaries, updated in place.
        """
        node_ids = self._find_sample_nodes(x_anonymizer_train)
        # group the row positions by leaf once instead of rescanning all samples for every cell
        indexes_by_node = {}
        for index, node_id in enumerate(node_ids):
            indexes_by_node.setdefault(node_id, []).append(index)

        is_ndarray = isinstance(x, np.ndarray)
        for cell in cells:
            cell['representative'] = {}
            # get all rows in cell
            indexes = indexes_by_node.get(cell['id'], [])
            # TODO: should we filter only those with majority label? (using hist)
            rows = x[indexes] if is_ndarray else x.iloc[indexes]
            for feature in self.quasi_identifiers:
                values = rows[:, feature] if is_ndarray else rows.loc[:, feature]
                if self.categorical_features and feature in self.categorical_features:
                    # find most common value
                    cell['representative'][feature] = Counter(values).most_common(1)[0][0]
                else:
                    # find the existing value closest to the median (per feature)
                    median = np.median(values)
                    min_value = max(values)
                    min_dist = float("inf")
                    for value in values:
                        # absolute difference == euclidean distance between two scalars
                        dist = abs(value - median)
                        if dist < min_dist:
                            min_dist = dist
                            min_value = value
                    cell['representative'][feature] = min_value

    def _find_sample_nodes(self, samples):
        """
        Return, for every sample, the id of the anonymizer tree leaf it falls into.

        :param samples: Quasi-identifier features in the same layout used to train the anonymizer.
        :return: List of leaf node ids (plain ints), one per sample.
        """
        # apply() yields the leaf index directly, avoiding a dense decision-path matrix per sample
        return [int(leaf) for leaf in self.anonymizer.apply(samples)]

    def _find_sample_cells(self, samples, cells_by_id):
        """Return, for every sample, the cell dictionary of the leaf it falls into."""
        node_ids = self._find_sample_nodes(samples)
        return [cells_by_id[node_id] for node_id in node_ids]

    def _anonymize_data_numpy(self, x, x_anonymizer_train, cells_by_id):
        """Overwrite the quasi-identifier columns of the numpy array ``x`` with the cell representatives."""
        cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
        for row, cell in zip(x, cells):
            # row is a view into x, so the assignment updates x itself
            for feature, representative in cell['representative'].items():
                row[feature] = representative
        return x

    def _anonymize_data_pandas(self, x, x_anonymizer_train, cells_by_id):
        """Overwrite the quasi-identifier columns of the dataframe ``x`` with the cell representatives."""
        cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
        for i, cell in zip(x.index, cells):
            for feature, representative in cell['representative'].items():
                x.at[i, feature] = representative
        return x

    def _modify_categorical_features(self, x):  # only for pandas
        """
        One-hot encode the categorical quasi-identifiers of ``x``.

        Records the observed values per feature in ``self.categorical_values`` and the mapping from one-hot column
        to source feature in ``self.one_hot_to_features``.

        :param x: Dataframe holding the quasi-identifier columns.
        :return: A new dataframe in which each categorical quasi-identifier is replaced by its one-hot columns.
        """
        self.categorical_values = {}
        self.one_hot_to_features = {}
        features_to_remove = []
        # work on a private copy so the caller's frame is never modified
        x = x.copy()
        for feature in self.categorical_features:
            if feature in self.quasi_identifiers:
                values = list(x.loc[:, feature].unique())
                self.categorical_values[feature] = values
                x[feature] = pd.Categorical(x.loc[:, feature], categories=values, ordered=False)
                one_hot_vector = pd.get_dummies(x[feature], prefix=feature)
                for one_hot_vector_feature in one_hot_vector.columns:
                    self.one_hot_to_features[one_hot_vector_feature] = feature
                x = pd.concat([x, one_hot_vector], axis=1)
                features_to_remove.append(feature)
        return x.drop(features_to_remove, axis=1)
|
||||
219
apt/utils.py
Normal file
219
apt/utils.py
Normal file
|
|
@ -0,0 +1,219 @@
|
|||
from sklearn import datasets, model_selection
|
||||
import sklearn.preprocessing
|
||||
import pandas as pd
|
||||
import ssl
|
||||
from os import path
|
||||
from six.moves.urllib.request import urlretrieve
|
||||
|
||||
|
||||
def _load_iris(test_set_size: float = 0.3):
    """
    Load the scikit-learn Iris dataset and split it into training and test sets.

    :param test_set_size: Value passed to ``train_test_split`` as ``test_size``.
    :return: Two tuples, ``(x_train, y_train), (x_test, y_test)``.
    """
    iris = datasets.load_iris()

    # Shuffled split, stratified on the labels, with a fixed random state
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        iris.data,
        iris.target,
        test_size=test_set_size,
        random_state=18,
        stratify=iris.target,
        shuffle=True,
    )

    return (x_train, y_train), (x_test, y_test)
|
||||
|
||||
|
||||
def get_iris_dataset(test_set: float = 0.3):
    """
    Loads the Iris dataset from scikit-learn.

    :param test_set: Proportion of the data to use as validation split (value between 0 and 1).
    :return: Training and test splits as ``(x_train, y_train), (x_test, y_test)``.
    """
    return _load_iris(test_set)
|
||||
|
||||
|
||||
def get_adult_dataset():
    """
    Loads the UCI Adult dataset from `../datasets/adult` (relative to the current working directory) or downloads
    it there if necessary.

    :return: Dataset and labels as pandas dataframes, as ``(x_train, y_train), (x_test, y_test)``.
    """
    import os

    features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
                'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
                'label']
    train_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
    test_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
    train_file = '../datasets/adult/train'
    test_file = '../datasets/adult/test'

    missing = [(url, file_name) for url, file_name in ((train_url, train_file), (test_url, test_file))
               if not path.exists(file_name)]
    if missing:
        # NOTE(security): certificate verification is disabled for these downloads. The override is limited to
        # the download itself and the previous HTTPS context factory is restored afterwards, so the rest of the
        # process keeps verifying certificates.
        previous_context = ssl._create_default_https_context
        ssl._create_default_https_context = ssl._create_unverified_context
        try:
            for url, file_name in missing:
                # urlretrieve does not create missing parent directories
                os.makedirs(path.dirname(file_name), exist_ok=True)
                urlretrieve(url, file_name)
        finally:
            ssl._create_default_https_context = previous_context

    train = pd.read_csv(train_file, sep=', ', names=features, engine='python')
    test = pd.read_csv(test_file, sep=', ', names=features, engine='python')
    # drop the first row of the test file (not a data record)
    test = test.iloc[1:]

    train = _modify_adult_dataset(train)
    test = _modify_adult_dataset(test)

    x_train = train.drop(['label'], axis=1)
    y_train = train.loc[:, 'label']
    x_test = test.drop(['label'], axis=1)
    y_test = test.loc[:, 'label']

    return (x_train, y_train), (x_test, y_test)
|
||||
|
||||
|
||||
def _modify_adult_dataset(data):
|
||||
def modify_label(value):
|
||||
if value == '<=50K.' or value == '<=50K':
|
||||
return 0
|
||||
elif value == '>50K.' or value == '>50K':
|
||||
return 1
|
||||
else:
|
||||
raise Exception('Bad label value')
|
||||
|
||||
def modify_native_country(value):
|
||||
Euro_1 = ['Italy', 'Holand-Netherlands', 'Germany', 'France']
|
||||
Euro_2 = ['Yugoslavia', 'South', 'Portugal', 'Poland', 'Hungary', 'Greece']
|
||||
SE_Asia = ['Vietnam', 'Thailand', 'Philippines', 'Laos', 'Cambodia']
|
||||
UnitedStates = ['United-States']
|
||||
LatinAmerica = ['Trinadad&Tobago', 'Puerto-Rico', 'Outlying-US(Guam-USVI-etc)', 'Nicaragua', 'Mexico',
|
||||
'Jamaica', 'Honduras', 'Haiti', 'Guatemala', 'Dominican-Republic']
|
||||
China = ['Taiwan', 'Hong', 'China']
|
||||
BritishCommonwealth = ['Scotland', 'Ireland', 'India', 'England', 'Canada']
|
||||
SouthAmerica = ['Peru', 'El-Salvador', 'Ecuador', 'Columbia']
|
||||
Other = ['Japan', 'Iran', 'Cuba']
|
||||
|
||||
if value in Euro_1:
|
||||
return 'Euro_1'
|
||||
elif value in Euro_2:
|
||||
return 'Euro_2'
|
||||
elif value in SE_Asia:
|
||||
return 'SE_Asia'
|
||||
elif value in UnitedStates:
|
||||
return 'UnitedStates'
|
||||
elif value in LatinAmerica:
|
||||
return 'LatinAmerica'
|
||||
elif value in China:
|
||||
return 'China'
|
||||
elif value in BritishCommonwealth:
|
||||
return 'BritishCommonwealth'
|
||||
elif value in SouthAmerica:
|
||||
return 'SouthAmerica'
|
||||
elif value in Other:
|
||||
return 'Other'
|
||||
elif value == '?':
|
||||
return 'Unknown'
|
||||
else:
|
||||
raise Exception('Bad native country value')
|
||||
|
||||
data['label'] = data['label'].apply(modify_label)
|
||||
data['native-country'] = data['native-country'].apply(modify_native_country)
|
||||
|
||||
for col in ('age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'):
|
||||
try:
|
||||
data[col] = data[col].fillna(0)
|
||||
except KeyError:
|
||||
print('missing column ' + col)
|
||||
|
||||
for col in ('workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'):
|
||||
try:
|
||||
data[col] = data[col].fillna('NA')
|
||||
except KeyError:
|
||||
print('missing column ' + col)
|
||||
|
||||
return data.drop(['fnlwgt', 'education'], axis=1)
|
||||
|
||||
|
||||
def get_nursery_dataset(raw: bool = True, test_set: float = 0.2, transform_social: bool = False):
    """
    Loads the UCI Nursery dataset from `../datasets/nursery` (relative to the current working directory) or
    downloads it there if necessary.

    :param raw: `True` if no preprocessing should be applied to the data. Otherwise, categorical data is one-hot
                encoded and data is scaled using sklearn's StandardScaler.
    :param test_set: Proportion of the data to use as validation split. The value should be between 0 and 1.
    :param transform_social: If `True`, transforms the social feature to be binary for the purpose of attribute
                             inference. This is done by assigning the original value 'problematic' the new value 1, and
                             the other original values are assigned the new value 0.
    :return: Dataset and labels as pandas dataframes, as ``(x_train, y_train), (x_test, y_test)``.
    :raises Exception: If an unexpected label value is found in the data.
    """
    import os

    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data'
    data_file = '../datasets/nursery/data'

    if not path.exists(data_file):
        # urlretrieve does not create missing parent directories
        os.makedirs(path.dirname(data_file), exist_ok=True)
        # NOTE(security): certificate verification is disabled for this download. The override is limited to the
        # download itself and the previous HTTPS context factory is restored afterwards.
        previous_context = ssl._create_default_https_context
        ssl._create_default_https_context = ssl._create_unverified_context
        try:
            urlretrieve(url, data_file)
        finally:
            ssl._create_default_https_context = previous_context

    # load data
    features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health", "label"]
    categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health"]
    data = pd.read_csv(data_file, sep=",", names=features, engine="python")
    # remove rows with missing label or too sparse label
    data = data.dropna(subset=["label"])
    data = data.drop(data.index[data["label"] == "recommend"])

    # fill missing values
    data["children"] = data["children"].fillna(0)
    for col in categorical_features:
        data[col] = data[col].fillna("other")

    # make categorical label
    def modify_label(value):  # 4 classes ('recommend' rows were removed above)
        if value == "not_recom":
            return 0
        elif value == "very_recom":
            return 1
        elif value == "priority":
            return 2
        elif value == "spec_prior":
            return 3
        else:
            raise Exception("Bad label value: %s" % value)

    data["label"] = data["label"].apply(modify_label)
    data["children"] = data["children"].apply(lambda x: 4 if x == "more" else x)

    if transform_social:

        def modify_social(value):
            if value == "problematic":
                return 1
            else:
                return 0

        data["social"] = data["social"].apply(modify_social)
        # social is numeric now, so it must not be one-hot encoded below
        categorical_features.remove("social")

    if not raw:
        # one-hot-encode categorical features
        features_to_remove = []
        for feature in categorical_features:
            values = list(data.loc[:, feature].unique())
            data[feature] = pd.Categorical(data.loc[:, feature], categories=values, ordered=False)
            one_hot_vector = pd.get_dummies(data[feature], prefix=feature)
            data = pd.concat([data, one_hot_vector], axis=1)
            features_to_remove.append(feature)
        data = data.drop(features_to_remove, axis=1)

        # normalize data
        label = data.loc[:, "label"]
        feature_data = data.drop(["label"], axis=1)
        scaler = sklearn.preprocessing.StandardScaler()
        scaler.fit(feature_data)
        # Keep the original row index: rows were dropped above, so a fresh RangeIndex would no longer line up
        # with `label` and the index-aligned concat would pair labels with the wrong rows and lose records.
        scaled_features = pd.DataFrame(scaler.transform(feature_data), columns=feature_data.columns,
                                       index=feature_data.index)
        data = pd.concat([label, scaled_features], axis=1, join="inner")

    # Split training and test sets (single stratified, shuffled split with a fixed random state)
    stratified = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1, test_size=test_set, random_state=18)
    train_positions, test_positions = next(stratified.split(data, data["label"]))
    train = data.iloc[train_positions]
    test = data.iloc[test_positions]

    x_train = train.drop(["label"], axis=1)
    y_train = train.loc[:, "label"]
    x_test = test.drop(["label"], axis=1)
    y_test = test.loc[:, "label"]

    return (x_train, y_train), (x_test, y_test)
|
||||
Loading…
Add table
Add a link
Reference in a new issue