Squashed commit of the following:

commit d53818644e
Author: olasaadi <92303887+olasaadi@users.noreply.github.com>
Date:   Mon Mar 7 20:12:55 2022 +0200

    Build the dt on all features anon (#23)

    * add param to build the DT on all features and not just on QI
    * one-hot encoding only for categorical features

commit c47819a031
Author: abigailt <abigailt@il.ibm.com>
Date:   Wed Feb 23 19:40:11 2022 +0200

    Update docs

commit 7e2ce7fe96
Merge: 7fbd1e4 752871d
Author: abigailt <abigailt@il.ibm.com>
Date:   Wed Feb 23 19:26:44 2022 +0200

    Merge remote-tracking branch 'origin/main' into main

commit 7fbd1e4b90
Author: abigailt <abigailt@il.ibm.com>
Date:   Wed Feb 23 19:22:54 2022 +0200

    Update version and docs

commit 752871dd0c
Author: olasaadi <92303887+olasaadi@users.noreply.github.com>
Date:   Wed Feb 23 14:57:12 2022 +0200

    add minimization notebook (#22)

    * add german credit notebook to showcase new features (minimize only some features and categorical features)

    * add notebook to show minimization data on a regression problem
This commit is contained in:
abigailt 2022-04-25 17:39:30 +03:00
parent fb2413c4aa
commit a37ff06df8
12 changed files with 753 additions and 69 deletions

View file

@ -3,6 +3,9 @@ import pandas as pd
from scipy.spatial import distance
from collections import Counter
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
@ -15,28 +18,38 @@ class Anonymize:
Class for performing tailored, model-guided anonymization of training datasets for ML models.
Based on the implementation described in: https://arxiv.org/abs/2007.13086
Parameters
----------
k : int
The privacy parameter that determines the number of records that will be indistinguishable from each
other (when looking at the quasi identifiers). Should be at least 2.
quasi_identifiers : np.ndarray or list
The features that need to be minimized: feature names in case of pandas data, or
feature indexes in case of numpy data.
categorical_features : list, optional
The list of categorical features (should only be supplied when passing data as a
pandas dataframe).
is_regression : Bool, optional
Whether the model is a regression model or not (if False, assumes
a classification model). Default is False.
train_only_QI : Bool, optional
Whether to train the anonymization decision tree only on the quasi-identifier
features. Default is False, i.e. the tree is trained on all features.
"""
def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
is_regression=False):
"""
:param k: The privacy parameter that determines the number of records that will be indistinguishable from each
other (when looking at the quasi identifiers). Should be at least 2.
:param quasi_identifiers: The features that need to be minimized. It can be a list of feature names (strings) if
dataset.feature_names is set, otherwise a list of indexes (integers).
:param categorical_features: The list of categorical features. It can be a list of feature names (strings) if
dataset.feature_names is set, otherwise a list of indexes (integers).
:param is_regression: Boolean param indicating that it is a regression problem.
"""
is_regression=False, train_only_QI=False):
if k < 2:
raise ValueError("k should be a positive integer with a value of 2 or higher")
if quasi_identifiers is None or len(quasi_identifiers) < 1:
raise ValueError("The list of quasi-identifiers cannot be empty")
self.k = k
self.quasi_identifiers = quasi_identifiers
self.categorical_features = categorical_features
self.is_regression = is_regression
self.features_names = None
self.train_only_QI = train_only_QI
def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
"""
@ -72,7 +85,10 @@ class Anonymize:
def _anonymize(self, x, y):
if x.shape[0] != y.shape[0]:
raise ValueError("x and y should have same number of rows")
x_anonymizer_train = x[:, self.quasi_identifiers]
x_anonymizer_train = x
if self.train_only_QI:
# build DT just on QI features
x_anonymizer_train = x[:, self.quasi_identifiers]
if x.dtype.kind not in 'iufc':
if not self.categorical_features:
raise ValueError('when supplying an array with non-numeric data, categorical_features must be defined')
@ -151,6 +167,21 @@ class Anonymize:
return x
def _modify_categorical_features(self, x):
encoder = OneHotEncoder()
one_hot_encoded = encoder.fit_transform(x)
return one_hot_encoded
# prepare data for DT
used_features = self.features
if self.train_only_QI:
used_features = self.quasi_identifiers
numeric_features = [f for f in x.columns if f in used_features and f not in self.categorical_features]
categorical_features = [f for f in self.categorical_features if f in used_features]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
encoded = preprocessor.fit_transform(x)
return encoded