mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-05-07 11:02:37 +02:00
Squashed commit of the following:
commitd53818644eAuthor: olasaadi <92303887+olasaadi@users.noreply.github.com> Date: Mon Mar 7 20:12:55 2022 +0200 Build the dt on all features anon (#23) * add param to build the DT on all features and not just on QI * one-hot encoding only for categorical features commitc47819a031Author: abigailt <abigailt@il.ibm.com> Date: Wed Feb 23 19:40:11 2022 +0200 Update docs commit7e2ce7fe96Merge:7fbd1e4752871dAuthor: abigailt <abigailt@il.ibm.com> Date: Wed Feb 23 19:26:44 2022 +0200 Merge remote-tracking branch 'origin/main' into main commit7fbd1e4b90Author: abigailt <abigailt@il.ibm.com> Date: Wed Feb 23 19:22:54 2022 +0200 Update version and docs commit752871dd0cAuthor: olasaadi <92303887+olasaadi@users.noreply.github.com> Date: Wed Feb 23 14:57:12 2022 +0200 add minimization notebook (#22) * add german credit notebook to showcase new features (minimize only some features and categorical features) * add notebook to show minimization data on a regression problem
This commit is contained in:
parent
fb2413c4aa
commit
a37ff06df8
12 changed files with 753 additions and 69 deletions
|
|
@ -3,6 +3,9 @@ import pandas as pd
|
|||
from scipy.spatial import distance
|
||||
from collections import Counter
|
||||
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
|
||||
|
|
@ -15,28 +18,38 @@ class Anonymize:
|
|||
Class for performing tailored, model-guided anonymization of training datasets for ML models.
|
||||
|
||||
Based on the implementation described in: https://arxiv.org/abs/2007.13086
|
||||
Parameters
|
||||
----------
|
||||
k : int
|
||||
The privacy parameter that determines the number of records that will be indistinguishable from each
|
||||
other (when looking at the quasi identifiers). Should be at least 2.
|
||||
quasi_identifiers : np.ndarray or list
|
||||
The features that need to be minimized in case of pandas data, and indexes of features
|
||||
in case of numpy data.
|
||||
categorical_features : list, optional
|
||||
The list of categorical features (should only be supplied when passing data as a
|
||||
pandas dataframe.
|
||||
is_regression : Bool, optional
|
||||
Whether the model is a regression model or not (if False, assumes
|
||||
a classification model). Default is False.
|
||||
train_only_QI : Bool, optional
|
||||
The required method to train data set for anonymization. Default is
|
||||
to train the tree on all features.
|
||||
"""
|
||||
|
||||
def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
|
||||
is_regression=False):
|
||||
"""
|
||||
:param k: The privacy parameter that determines the number of records that will be indistinguishable from each
|
||||
other (when looking at the quasi identifiers). Should be at least 2.
|
||||
:param quasi_identifiers: The features that need to be minimized. It can be a list of feature names (strings) if
|
||||
dataset.feature_names is set, otherwise a list of indexes (integers).
|
||||
:param categorical_features: The list of categorical features. It can be a list of feature names (strings) if
|
||||
dataset.feature_names is set, otherwise a list of indexes (integers).
|
||||
:param is_regression: Boolean param indicates that is is a regression problem.
|
||||
"""
|
||||
is_regression=False, train_only_QI=False):
|
||||
if k < 2:
|
||||
raise ValueError("k should be a positive integer with a value of 2 or higher")
|
||||
if quasi_identifiers is None or len(quasi_identifiers) < 1:
|
||||
raise ValueError("The list of quasi-identifiers cannot be empty")
|
||||
|
||||
self.k = k
|
||||
self.quasi_identifiers = quasi_identifiers
|
||||
self.categorical_features = categorical_features
|
||||
self.is_regression = is_regression
|
||||
self.features_names = None
|
||||
self.train_only_QI = train_only_QI
|
||||
|
||||
def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
|
||||
"""
|
||||
|
|
@ -72,7 +85,10 @@ class Anonymize:
|
|||
def _anonymize(self, x, y):
|
||||
if x.shape[0] != y.shape[0]:
|
||||
raise ValueError("x and y should have same number of rows")
|
||||
x_anonymizer_train = x[:, self.quasi_identifiers]
|
||||
x_anonymizer_train = x
|
||||
if self.train_only_QI:
|
||||
# build DT just on QI features
|
||||
x_anonymizer_train = x[:, self.quasi_identifiers]
|
||||
if x.dtype.kind not in 'iufc':
|
||||
if not self.categorical_features:
|
||||
raise ValueError('when supplying an array with non-numeric data, categorical_features must be defined')
|
||||
|
|
@ -151,6 +167,21 @@ class Anonymize:
|
|||
return x
|
||||
|
||||
def _modify_categorical_features(self, x):
|
||||
encoder = OneHotEncoder()
|
||||
one_hot_encoded = encoder.fit_transform(x)
|
||||
return one_hot_encoded
|
||||
# prepare data for DT
|
||||
used_features = self.features
|
||||
if self.train_only_QI:
|
||||
used_features = self.quasi_identifiers
|
||||
numeric_features = [f for f in x.columns if f in used_features and f not in self.categorical_features]
|
||||
categorical_features = [f for f in self.categorical_features if f in used_features]
|
||||
numeric_transformer = Pipeline(
|
||||
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
|
||||
)
|
||||
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
|
||||
preprocessor = ColumnTransformer(
|
||||
transformers=[
|
||||
("num", numeric_transformer, numeric_features),
|
||||
("cat", categorical_transformer, categorical_features),
|
||||
]
|
||||
)
|
||||
encoded = preprocessor.fit_transform(x)
|
||||
return encoded
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue