Merge with main

This commit is contained in:
abigailt 2022-08-01 18:11:34 +03:00
commit dc5cc793ee
30 changed files with 2819 additions and 1066 deletions

160
.gitignore vendored Normal file
View file

@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

View file

@ -6,4 +6,4 @@ from apt import anonymization
from apt import minimization
from apt import utils
__version__ = "0.0.4"
__version__ = "0.1.0"

View file

@ -18,27 +18,26 @@ class Anonymize:
Class for performing tailored, model-guided anonymization of training datasets for ML models.
Based on the implementation described in: https://arxiv.org/abs/2007.13086
Parameters
----------
k : int
The privacy parameter that determines the number of records that will be indistinguishable from each
other (when looking at the quasi identifiers). Should be at least 2.
quasi_identifiers : np.ndarray or list
The features that need to be minimized in case of pandas data, and indexes of features
in case of numpy data.
categorical_features : list, optional
The list of categorical features (should only be supplied when passing data as a
pandas dataframe.
is_regression : Bool, optional
Whether the model is a regression model or not (if False, assumes
a classification model). Default is False.
train_only_QI : Bool, optional
The required method to train data set for anonymization. Default is
to train the tree on all features.
:param k: The privacy parameter that determines the number of records that will be indistinguishable from each
other (when looking at the quasi identifiers). Should be at least 2.
:type k: int
:param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
in case of numpy data.
:type quasi_identifiers: np.ndarray or list
:param categorical_features: The list of categorical features (if supplied, these features will be one-hot encoded
before using them to train the decision tree model).
:type categorical_features: list, optional
:param is_regression: Whether the model is a regression model or not (if False, assumes a classification model).
Default is False.
:type is_regression: boolean, optional
:param train_only_QI: The required method to train data set for anonymization. Default is
to train the tree on all features.
:type train_only_QI: boolean, optional
"""
def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
is_regression=False, train_only_QI=False):
is_regression: Optional[bool] = False, train_only_QI: Optional[bool] = False):
if k < 2:
raise ValueError("k should be a positive integer with a value of 2 or higher")
if quasi_identifiers is None or len(quasi_identifiers) < 1:
@ -58,7 +57,9 @@ class Anonymize:
:param dataset: Data wrapper containing the training data for the model and the predictions of the
original model on the training data.
:return: An array containing the anonymized training dataset.
:type dataset: `ArrayDataset`
:return: The anonymized training dataset as either numpy array or pandas DataFrame (depending on the type of
the original data used to create the ArrayDataset).
"""
if dataset.get_samples().shape[1] != 0:
self.features = [i for i in range(dataset.get_samples().shape[1])]
@ -100,11 +101,11 @@ class Anonymize:
# build DT just on QI features
x_anonymizer_train = x_prepared[:, self.quasi_identifiers]
if self.is_regression:
self.anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
self._anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
else:
self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
self._anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
self.anonymizer.fit(x_anonymizer_train, y)
self._anonymizer.fit(x_anonymizer_train, y)
cells_by_id = self._calculate_cells(x, x_anonymizer_train)
return self._anonymize_data(x, x_anonymizer_train, cells_by_id)
@ -112,16 +113,16 @@ class Anonymize:
# x is original data, x_anonymizer_train is only QIs + 1-hot encoded
cells_by_id = {}
leaves = []
for node, feature in enumerate(self.anonymizer.tree_.feature):
for node, feature in enumerate(self._anonymizer.tree_.feature):
if feature == -2: # leaf node
leaves.append(node)
hist = [int(i) for i in self.anonymizer.tree_.value[node][0]]
hist = [int(i) for i in self._anonymizer.tree_.value[node][0]]
# TODO we may change the method for choosing representative for cell
# label_hist = self.anonymizer.tree_.value[node][0]
# label = int(self.anonymizer.classes_[np.argmax(label_hist)])
cell = {'label': 1, 'hist': hist, 'id': int(node)}
cells_by_id[cell['id']] = cell
self.nodes = leaves
self._nodes = leaves
self._find_representatives(x, x_anonymizer_train, cells_by_id.values())
return cells_by_id
@ -152,8 +153,8 @@ class Anonymize:
cell['representative'][feature] = min_value
def _find_sample_nodes(self, samples):
paths = self.anonymizer.decision_path(samples).toarray()
node_set = set(self.nodes)
paths = self._anonymizer.decision_path(samples).toarray()
node_set = set(self._nodes)
return [(list(set([i for i, v in enumerate(p) if v == 1]) & node_set))[0] for p in paths]
def _find_sample_cells(self, samples, cells_by_id):

View file

@ -11,8 +11,8 @@ from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.utils.validation import check_is_fitted
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split
@ -21,7 +21,8 @@ from apt.utils.models import Model, SklearnRegressor, ModelOutputType, SklearnCl
class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
""" A transformer that generalizes data to representative points.
"""
A transformer that generalizes data to representative points.
Learns data generalizations based on an original model's predictions
and a target accuracy. Once the generalizations are learned, can
@ -34,130 +35,142 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
need to supply an existing ``estimator`` to init.
In summary, either ``estimator`` and ``target_accuracy`` should be
supplied or ``cells`` should be supplied.
Parameters
----------
estimator : estimator, optional
The original model for which generalization is being performed.
Should be pre-fitted.
target_accuracy : float, optional
The required accuracy when applying the base model to the
generalized data. Accuracy is measured relative to the original
accuracy of the model.
categorical_features: list of str, optional
The list of categorical features should only be supplied when
passing data as a pandas dataframe.
features_to_minimize: List of str or numbers, optional
The features that need to be minimized in case of pandas data,
and indexes of features in case of numpy data.
cells : list of object, optional
The cells used to generalize records. Each cell must define a
range or subset of categories for each feature, as well as a
representative value for each feature.
This parameter should be used when instantiating a transformer
object without first fitting it.
train_only_QI : Bool, optional
The required method to train data set for minimizing. Default is
to train the tree just on the features that are given as
features_to_minimize.
is_regression : Bool, optional
Whether the model is a regression model or not (if False, assumes
a classification model). Default is False.
Attributes
----------
features_ : list of str
The feature names, in the order that they appear in the data.
cells_ : list of object
The cells used to generalize records, as learned when calling fit.
ncp_ : float
The NCP (information loss) score of the resulting generalization,
as measured on the training data.
generalizations_ : object
The generalizations that were learned (actual feature ranges).
:param estimator: The original model for which generalization is being performed. Should be pre-fitted.
:type estimator: sklearn `BaseEstimator` or `Model`
:param target_accuracy: The required relative accuracy when applying the base model to the generalized data.
Accuracy is measured relative to the original accuracy of the model.
:type target_accuracy: float, optional
:param cells: The cells used to generalize records. Each cell must define a range or subset of categories for
each feature, as well as a representative value for each feature. This parameter should be used
when instantiating a transformer object without first fitting it.
:type cells: list of objects, optional
:param categorical_features: The list of categorical features (if supplied, these features will be one-hot
encoded before using them to train the decision tree model).
:param encoder: Optional encoder for encoding data before feeding it into the estimator (e.g., for categorical
features)
:type encoder: sklearn OrdinalEncoder or OneHotEncoder
:type categorical_features: list of strings, optional
:param features_to_minimize: The features to be minimized.
:type features_to_minimize: list of strings or int, optional
:param train_only_features_to_minimize: Whether to train the tree just on the ``features_to_minimize`` or on all
features. Default is only on ``features_to_minimize``.
:type train_only_features_to_minimize: boolean, optional
:param is_regression: Whether the model is a regression model or not (if False, assumes a classification model).
Default is False.
:type is_regression: boolean, optional
"""
def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: float = 0.998,
cells: list = None, categorical_features: Union[np.ndarray, list] = None,
features_to_minimize: Union[np.ndarray, list] = None, train_only_QI: bool = True,
is_regression: bool = False):
if issubclass(estimator.__class__, Model):
self.estimator = estimator
else:
def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: Optional[float] = 0.998,
cells: Optional[list] = None, categorical_features: Optional[Union[np.ndarray, list]] = None,
encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None,
features_to_minimize: Optional[Union[np.ndarray, list]] = None,
train_only_features_to_minimize: Optional[bool] = True,
is_regression: Optional[bool] = False):
self.estimator = estimator
if estimator is not None and not issubclass(estimator.__class__, Model):
if is_regression:
self.estimator = SklearnRegressor(estimator)
else:
self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_VECTOR)
self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_PROBABILITIES)
self.target_accuracy = target_accuracy
self.cells = cells
self.categorical_features = []
if categorical_features:
self.categorical_features = categorical_features
self.features_to_minimize = features_to_minimize
self.train_only_QI = train_only_QI
self.train_only_features_to_minimize = train_only_features_to_minimize
self.is_regression = is_regression
self.encoder = encoder
def get_params(self, deep=True):
"""Get parameters for this estimator.
"""
Get parameters
Parameters
----------
deep : boolean, optional
If True, will return the parameters for this estimator and contained
subobjects that are estimators.
Returns
-------
params : mapping of string to any
Parameter names mapped to their values.
:param deep: If True, will return the parameters for this estimator and contained
sub-objects that are estimators.
:type deep: boolean, optional
:return: Parameter names mapped to their values
"""
ret = {}
ret['target_accuracy'] = self.target_accuracy
ret['categorical_features'] = self.categorical_features
ret['features_to_minimize'] = self.features_to_minimize
ret['train_only_features_to_minimize'] = self.train_only_features_to_minimize
ret['is_regression'] = self.is_regression
if deep:
ret['cells'] = copy.deepcopy(self.cells)
ret['estimator'] = self.estimator
ret['encoder'] = self.encoder
else:
ret['cells'] = copy.copy(self.cells)
return ret
def set_params(self, **params):
"""Set the parameters of this estimator.
"""
Set parameters
Returns
-------
self : object
Returns self.
:param target_accuracy: The required relative accuracy when applying the base model to the generalized data.
Accuracy is measured relative to the original accuracy of the model.
:type target_accuracy: float, optional
:param cells: The cells used to generalize records. Each cell must define a range or subset of categories for
each feature, as well as a representative value for each feature. This parameter should be used
when instantiating a transformer object without first fitting it.
:type cells: list of objects, optional
:return: self
"""
if 'target_accuracy' in params:
self.target_accuracy = params['target_accuracy']
if 'categorical_features' in params:
self.categorical_features = params['categorical_features']
if 'features_to_minimize' in params:
self.features_to_minimize = params['features_to_minimize']
if 'train_only_features_to_minimize' in params:
self.train_only_features_to_minimize = params['train_only_features_to_minimize']
if 'is_regression' in params:
self.is_regression = params['is_regression']
if 'cells' in params:
self.cells = params['cells']
return self
@property
def generalizations(self):
return self.generalizations_
"""
Return the generalizations derived from the model and test data.
:return: generalizations object. Contains 3 sections: 'ranges' that contains ranges for numerical features,
'categories' that contains sub-groups of categories for categorical features, and
'untouched' that contains the features that could not be generalized.
"""
return self._generalizations
@property
def ncp(self):
"""
Return the NCP score of the generalizations.
:return: ncp score as float.
"""
return self._ncp
def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
features_names: Optional = None, dataset: Optional[ArrayDataset] = None):
"""Learns the generalizations based on training data, and applies them to the data.
features_names: Optional[list] = None, dataset: Optional[ArrayDataset] = None):
"""
Learns the generalizations based on training data, and applies them to the data.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
The training input samples.
y : array-like, shape (n_samples,), optional
The target values. An array of int.
This should contain the predictions of the original model on ``X``.
features_names : list of str, The feature names, in the order that they appear in the data,
provided just if X and y were provided (optional).
dataset : Data wrapper containing the training input samples and the predictions of the
original model on the training data.
Either X,y OR dataset need to be provided, not both.
Returns
-------
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
The array containing the representative values to which each record in
``X`` is mapped.
:param X: The training input samples.
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
:param y: The target values. This should contain the predictions of the original model on ``X``.
:type y: array-like, shape (n_samples,), optional
:param features_names: The feature names, in the order that they appear in the data. Can be provided when
passing the data as ``X`` and ``y``
:type features_names: list of strings, optional
:param dataset: Data wrapper containing the training input samples and the predictions of the original model
on the training data. Either ``X``, ``y`` OR ``dataset`` need to be provided, not both.
:type dataset: `ArrayDataset`, optional
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
"""
self.fit(X, y, features_names, dataset=dataset)
return self.transform(X, features_names, dataset=dataset)
@ -166,23 +179,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
features_names: Optional = None, dataset: ArrayDataset = None):
"""Learns the generalizations based on training data.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
The training input samples.
y : array-like, shape (n_samples,), optional
The target values. An array of int.
This should contain the predictions of the original model on ``X``.
features_names : list of str, The feature names, in the order that they appear in the data,
provided just if X and y were provided (optional).
dataset : Data wrapper containing the training input samples and the predictions of the
original model on the training data.
Either X,y OR dataset need to be provided, not both.
Returns
-------
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
The array containing the representative values to which each record in
``X`` is mapped.
:param X: The training input samples.
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
:param y: The target values. This should contain the predictions of the original model on ``X``.
:type y: array-like, shape (n_samples,), optional
:param features_names: The feature names, in the order that they appear in the data. Can be provided when
passing the data as ``X`` and ``y``
:type features_names: list of strings, optional
:param dataset: Data wrapper containing the training input samples and the predictions of the original model
on the training data. Either ``X``, ``y`` OR ``dataset`` need to be provided, not both.
:type dataset: `ArrayDataset`, optional
:return: self
"""
# take into account that estimator, X, y, cells, features may be None
@ -193,27 +200,20 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
dataset = ArrayDataset(X, y, features_names)
if dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
self.n_features_ = dataset.get_samples().shape[1]
self._n_features = dataset.get_samples().shape[1]
elif dataset and dataset.features_names:
self.n_features_ = len(dataset.features_names)
self._n_features = len(dataset.features_names)
else:
self.n_features_ = 0
self._n_features = 0
if dataset and dataset.features_names:
self._features = dataset.features_names
# if features is None, use numbers instead of names
elif self.n_features_ != 0:
self._features = [str(i) for i in range(self.n_features_)]
elif self._n_features != 0:
self._features = [str(i) for i in range(self._n_features)]
else:
self._features = None
if self.cells:
self.cells_ = self.cells
else:
self.cells_ = {}
self.categorical_values = {}
# Going to fit
# (currently not dealing with option to fit with only X and y and no estimator)
if self.estimator and dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
@ -227,7 +227,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# divide dataset into train and test
used_data = x
if self.train_only_QI:
if self.train_only_features_to_minimize:
used_data = x_QI
if self.is_regression:
X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4, random_state=14)
@ -238,11 +238,12 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
X_train_QI = X_train.loc[:, self.features_to_minimize]
X_test_QI = X_test.loc[:, self.features_to_minimize]
used_X_train = X_train
if self.train_only_QI:
used_X_test = X_test
if self.train_only_features_to_minimize:
used_X_train = X_train_QI
used_X_test = X_test_QI
# collect feature data (such as min, max)
feature_data = {}
for feature in self._features:
if feature not in feature_data.keys():
@ -253,83 +254,55 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
fd['max'] = max(values)
fd['range'] = max(values) - min(values)
else:
fd['range'] = len(values)
fd['range'] = len(np.unique(values))
feature_data[feature] = fd
# prepare data for DT
categorical_features = [f for f in self._features if f in self.categorical_features and
f in self.features_to_minimize]
# default encoder in case none provided
if self.encoder is None:
numeric_features = [f for f in self._features if f not in self.categorical_features]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
self.encoder = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, self.categorical_features),
]
)
self.encoder.fit(x)
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
self.cells = []
self._categorical_values = {}
numeric_features = [f for f in self._features if f not in self.categorical_features and
f in self.features_to_minimize]
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor_QI_features = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
preprocessor_QI_features.fit(x_QI)
# preprocessor to fit data that have features not included in QI (to get accuracy)
numeric_features = [f for f in self._features if f not in self.categorical_features]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, self.categorical_features),
]
)
preprocessor.fit(x)
x_prepared = preprocessor.transform(X_train)
if self.train_only_QI:
x_prepared = preprocessor_QI_features.transform(X_train_QI)
self._preprocessor = preprocessor
self.cells_ = {}
if self.is_regression:
self.dt_ = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=1)
self._dt = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=1)
else:
self.dt_ = DecisionTreeClassifier(random_state=0, min_samples_split=2,
self._dt = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
self.dt_.fit(x_prepared, y_train)
self._modify_categorical_features(used_data)
x_prepared = pd.DataFrame(x_prepared, columns=self.categorical_data.columns)
# prepare data for DT
self._encode_categorical_features(used_data, save_mapping=True)
x_prepared = self._encode_categorical_features(used_X_train)
self._dt.fit(x_prepared, y_train)
x_prepared_test = self._encode_categorical_features(used_X_test)
self._calculate_cells()
self._modify_cells()
# features that are not from QI should not be part of generalizations
for feature in self._features:
if feature not in self.features_to_minimize:
self._remove_feature_from_cells(self.cells_, self.cells_by_id_, feature)
self._remove_feature_from_cells(self.cells, self._cells_by_id, feature)
nodes = self._get_nodes_level(0)
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
# self.cells_ currently holds the generalization created from the tree leaves
# self.cells currently holds the generalization created from the tree leaves
self._calculate_generalizations()
# apply generalizations to test data
x_prepared_test = preprocessor.transform(X_test)
if self.train_only_QI:
x_prepared_test = preprocessor_QI_features.transform(X_test_QI)
x_prepared_test = pd.DataFrame(x_prepared_test, index=X_test.index, columns=self.categorical_data.columns)
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
# check accuracy
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
print('Initial accuracy of model on generalized data, relative to original model predictions '
'(base generalization derived from tree, before improvements): %f' % accuracy)
@ -338,31 +311,34 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
print('Improving generalizations')
level = 1
while accuracy > self.target_accuracy:
try:
cells_previous_iter = self.cells_
generalization_prev_iter = self.generalizations_
cells_by_id_prev = self.cells_by_id_
nodes = self._get_nodes_level(level)
self._calculate_level_cells(level)
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
cells_previous_iter = self.cells
generalization_prev_iter = self._generalizations
cells_by_id_prev = self._cells_by_id
nodes = self._get_nodes_level(level)
self._calculate_generalizations()
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_,
self.cells_by_id_)
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
# if accuracy passed threshold roll back to previous iteration generalizations
if accuracy < self.target_accuracy:
self.cells_ = cells_previous_iter
self.generalizations_ = generalization_prev_iter
self.cells_by_id_ = cells_by_id_prev
break
else:
print('Pruned tree to level: %d, new relative accuracy: %f' % (level, accuracy))
level += 1
except Exception as e:
try:
self._calculate_level_cells(level)
except TypeError as e:
print(e)
break
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
self._calculate_generalizations()
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells,
self._cells_by_id)
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
# if accuracy passed threshold roll back to previous iteration generalizations
if accuracy < self.target_accuracy:
self.cells = cells_previous_iter
self._generalizations = generalization_prev_iter
self._cells_by_id = cells_by_id_prev
break
else:
print('Pruned tree to level: %d, new relative accuracy: %f' % (level, accuracy))
level += 1
# if accuracy below threshold, improve accuracy by removing features from generalization
elif accuracy < self.target_accuracy:
print('Improving accuracy')
@ -374,35 +350,32 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
break
self._calculate_generalizations()
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
# self.cells_ currently holds the chosen generalization based on target accuracy
# self.cells currently holds the chosen generalization based on target accuracy
# calculate iLoss
self.ncp_ = self._calculate_ncp(X_test, self.generalizations_, feature_data)
self._ncp = self._calculate_ncp(X_test, self._generalizations, feature_data)
# Return the transformer
return self
def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional = None, dataset: ArrayDataset = None):
def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None,
dataset: Optional[ArrayDataset] = None):
""" Transforms data records to representative points.
Parameters
----------
X : {array-like, sparse-matrix}, shape (n_samples, n_features), If provided as a pandas dataframe,
may contain both numeric and categorical data.
The input samples.
features_names : list of str, The feature names, in the order that they appear in the data,
provided just if X was provided (optional).
dataset : Data wrapper containing the training input samples.
Either X OR dataset need to be provided, not both.
Returns
-------
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
The array containing the representative values to which each record in
``X`` is mapped.
:param X: The input samples to transform.
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
:param features_names: The feature names, in the order that they appear in the data. Can be provided when
passing the data as ``X``
:type features_names: list of strings, optional
:param dataset: Data wrapper containing the training input samples and the predictions of the original model
on the training data. Either ``X`` OR ``dataset`` need to be provided, not both.
:type dataset: `ArrayDataset`, optional
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
"""
# Check if fit has been called
@ -423,45 +396,20 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
if dataset and dataset.get_samples() is not None:
x = pd.DataFrame(dataset.get_samples(), columns=self._features)
if x.shape[1] != self.n_features_ and self.n_features_ != 0:
if x.shape[1] != self._n_features and self._n_features != 0:
raise ValueError('Shape of input is different from what was seen'
'in `fit`')
if not self._features:
self._features = [i for i in range(x.shape[1])]
representatives = pd.DataFrame(columns=self._features) # only columns
generalized = pd.DataFrame(x, columns=self._features, copy=True) # original data
mapped = np.zeros(x.shape[0]) # to mark records we already mapped
all_indexes = []
for i in range(len(self.cells)):
indexes = self._get_record_indexes_for_cell(x, self.cells[i], mapped)
all_indexes.append(indexes)
generalized = self._generalize_indexes(x, self.cells, all_indexes)
# iterate over cells (leaves in decision tree)
for i in range(len(self.cells_)):
# Copy the representatives from the cells into another data structure:
# iterate over features in test data
for feature in self._features:
# if feature has a representative value in the cell and should not
# be left untouched, take the representative value
if feature in self.cells_[i]['representative'] and \
('untouched' not in self.cells_[i]
or feature not in self.cells_[i]['untouched']):
representatives.loc[i, feature] = self.cells_[i]['representative'][feature]
# else, drop the feature (removes from representatives columns that
# do not have a representative value or should remain untouched)
elif feature in representatives.columns.tolist():
representatives = representatives.drop(feature, axis=1)
# get the indexes of all records that map to this cell
indexes = self._get_record_indexes_for_cell(x, self.cells_[i], mapped)
# replace the values in the representative columns with the representative
# values (leaves others untouched)
if indexes and not representatives.columns.empty:
if len(indexes) > 1:
replace = pd.concat([representatives.loc[i].to_frame().T] * len(indexes)).reset_index(drop=True)
else:
replace = representatives.loc[i].to_frame().T.reset_index(drop=True)
replace.index = indexes
generalized.loc[indexes, representatives.columns] = replace
if dataset and dataset.is_pandas:
return generalized
elif isinstance(X, pd.DataFrame):
@ -491,29 +439,36 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
mapped.itemset(i, 1)
return True
def _modify_categorical_features(self, X):
self.categorical_values = {}
self.oneHotVectorFeaturesToFeatures = {}
def _encode_categorical_features(self, X, save_mapping=False):
if save_mapping:
self._categorical_values = {}
self._one_hot_vector_features_to_features = {}
features_to_remove = []
used_features = self._features
if self.train_only_QI:
if self.train_only_features_to_minimize:
used_features = self.features_to_minimize
for feature in self.categorical_features:
if feature in used_features:
try:
all_values = X.loc[:, feature]
values = list(all_values.unique())
self.categorical_values[feature] = values
X[feature] = pd.Categorical(X.loc[:, feature], categories=values, ordered=False)
if save_mapping:
self._categorical_values[feature] = values
X[feature] = pd.Categorical(X.loc[:, feature], categories=self._categorical_values[feature],
ordered=False)
ohe = pd.get_dummies(X[feature], prefix=feature)
for oneHotVectorFeature in ohe.columns:
self.oneHotVectorFeaturesToFeatures[oneHotVectorFeature] = feature
if save_mapping:
for one_hot_vector_feature in ohe.columns:
self._one_hot_vector_features_to_features[one_hot_vector_feature] = feature
X = pd.concat([X, ohe], axis=1)
features_to_remove.append(feature)
except KeyError:
print("feature " + feature + "not found in training data")
self.categorical_data = X.drop(features_to_remove, axis=1)
new_data = X.drop(features_to_remove, axis=1)
if save_mapping:
self._encoded_features = new_data.columns
return new_data
def _cell_contains_numeric(self, f, range, x):
i = self._features.index(f)
@ -538,24 +493,24 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return False
def _calculate_cells(self):
self.cells_by_id_ = {}
self.cells_ = self._calculate_cells_recursive(0)
self._cells_by_id = {}
self.cells = self._calculate_cells_recursive(0)
def _calculate_cells_recursive(self, node):
feature_index = self.dt_.tree_.feature[node]
feature_index = self._dt.tree_.feature[node]
if feature_index == -2:
# this is a leaf
# if it is a regression problem we do not use label
label = self._calculate_cell_label(node) if not self.is_regression else 1
hist = [int(i) for i in self.dt_.tree_.value[node][0]] if not self.is_regression else []
hist = [int(i) for i in self._dt.tree_.value[node][0]] if not self.is_regression else []
cell = {'label': label, 'hist': hist, 'ranges': {}, 'id': int(node)}
return [cell]
cells = []
feature = self.categorical_data.columns[feature_index]
threshold = self.dt_.tree_.threshold[node]
left_child = self.dt_.tree_.children_left[node]
right_child = self.dt_.tree_.children_right[node]
feature = self._encoded_features[feature_index]
threshold = self._dt.tree_.threshold[node]
left_child = self._dt.tree_.children_left[node]
right_child = self._dt.tree_.children_right[node]
left_child_cells = self._calculate_cells_recursive(left_child)
for cell in left_child_cells:
@ -564,7 +519,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
if cell['ranges'][feature]['end'] is None:
cell['ranges'][feature]['end'] = threshold
cells.append(cell)
self.cells_by_id_[cell['id']] = cell
self._cells_by_id[cell['id']] = cell
right_child_cells = self._calculate_cells_recursive(right_child)
for cell in right_child_cells:
@ -573,26 +528,26 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
if cell['ranges'][feature]['start'] is None:
cell['ranges'][feature]['start'] = threshold
cells.append(cell)
self.cells_by_id_[cell['id']] = cell
self._cells_by_id[cell['id']] = cell
return cells
def _calculate_cell_label(self, node):
label_hist = self.dt_.tree_.value[node][0]
return int(self.dt_.classes_[np.argmax(label_hist)])
label_hist = self._dt.tree_.value[node][0]
return int(self._dt.classes_[np.argmax(label_hist)])
def _modify_cells(self):
cells = []
features = self.categorical_data.columns
for cell in self.cells_:
features = self._encoded_features
for cell in self.cells:
new_cell = {'id': cell['id'], 'label': cell['label'], 'ranges': {}, 'categories': {}, 'hist': cell['hist'],
'representative': None}
'untouched': [], 'representative': None}
for feature in features:
if feature in self.oneHotVectorFeaturesToFeatures.keys():
if feature in self._one_hot_vector_features_to_features.keys():
# feature is categorical and should be mapped
categorical_feature = self.oneHotVectorFeaturesToFeatures[feature]
categorical_feature = self._one_hot_vector_features_to_features[feature]
if categorical_feature not in new_cell['categories'].keys():
new_cell['categories'][categorical_feature] = self.categorical_values[
new_cell['categories'][categorical_feature] = self._categorical_values[
categorical_feature].copy()
if feature in cell['ranges'].keys():
categorical_value = feature[len(categorical_feature) + 1:]
@ -609,11 +564,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
else:
new_cell['ranges'][feature] = {'start': None, 'end': None}
cells.append(new_cell)
self.cells_by_id_[new_cell['id']] = new_cell
self.cells_ = cells
self._cells_by_id[new_cell['id']] = new_cell
self.cells = cells
def _calculate_level_cells(self, level):
if level < 0 or level > self.dt_.get_depth():
if level < 0 or level > self._dt.get_depth():
raise TypeError("Illegal level %d' % level", level)
if level > 0:
@ -622,13 +577,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
nodes = self._get_nodes_level(level)
if nodes:
for node in nodes:
if self.dt_.tree_.feature[node] == -2: # leaf node
new_cell = self.cells_by_id_[node]
if self._dt.tree_.feature[node] == -2: # leaf node
new_cell = self._cells_by_id[node]
else:
left_child = self.dt_.tree_.children_left[node]
right_child = self.dt_.tree_.children_right[node]
left_cell = self.cells_by_id_[left_child]
right_cell = self.cells_by_id_[right_child]
left_child = self._dt.tree_.children_left[node]
right_child = self._dt.tree_.children_right[node]
left_cell = self._cells_by_id[left_child]
right_cell = self._cells_by_id[right_child]
new_cell = {'id': int(node), 'ranges': {}, 'categories': {}, 'untouched': [],
'label': None, 'representative': None}
for feature in left_cell['ranges'].keys():
@ -645,28 +600,28 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._calculate_level_cell_label(left_cell, right_cell, new_cell)
new_cells.append(new_cell)
new_cells_by_id[new_cell['id']] = new_cell
self.cells_ = new_cells
self.cells_by_id_ = new_cells_by_id
self.cells = new_cells
self._cells_by_id = new_cells_by_id
# else: nothing to do, stay with previous cells
def _calculate_level_cell_label(self, left_cell, right_cell, new_cell):
new_cell['hist'] = [x + y for x, y in
zip(left_cell['hist'], right_cell['hist'])] if not self.is_regression else []
new_cell['label'] = int(self.dt_.classes_[np.argmax(new_cell['hist'])]) if not self.is_regression else 1
new_cell['label'] = int(self._dt.classes_[np.argmax(new_cell['hist'])]) if not self.is_regression else 1
def _get_nodes_level(self, level):
# level = distance from lowest leaf
node_depth = np.zeros(shape=self.dt_.tree_.node_count, dtype=np.int64)
is_leaves = np.zeros(shape=self.dt_.tree_.node_count, dtype=bool)
node_depth = np.zeros(shape=self._dt.tree_.node_count, dtype=np.int64)
is_leaves = np.zeros(shape=self._dt.tree_.node_count, dtype=bool)
stack = [(0, -1)] # seed is the root node id and its parent depth
while len(stack) > 0:
node_id, parent_depth = stack.pop()
# depth = distance from root
node_depth[node_id] = parent_depth + 1
if self.dt_.tree_.children_left[node_id] != self.dt_.tree_.children_right[node_id]:
stack.append((self.dt_.tree_.children_left[node_id], parent_depth + 1))
stack.append((self.dt_.tree_.children_right[node_id], parent_depth + 1))
if self._dt.tree_.children_left[node_id] != self._dt.tree_.children_right[node_id]:
stack.append((self._dt.tree_.children_left[node_id], parent_depth + 1))
stack.append((self._dt.tree_.children_right[node_id], parent_depth + 1))
else:
is_leaves[node_id] = True
@ -685,7 +640,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# if there is no categorical data prepared data is original data
nodeIds = self._find_sample_nodes(prepared_data, level_nodes)
labels_df = pd.DataFrame(labelFeature, columns=['label'])
for cell in self.cells_:
for cell in self.cells:
cell['representative'] = {}
# get all rows in cell
indexes = [i for i, x in enumerate(nodeIds) if x == cell['id']]
@ -720,16 +675,24 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
cell['representative'][feature] = row[feature]
def _find_sample_nodes(self, samples, nodes):
paths = self.dt_.decision_path(samples).toarray()
paths = self._dt.decision_path(samples).toarray()
nodeSet = set(nodes)
return [(list(set([i for i, v in enumerate(p) if v == 1]) & nodeSet))[0] for p in paths]
def _generalize(self, original_data, prepared_data, level_nodes, cells, cells_by_id):
mapping_to_cells = self._map_to_cells(prepared_data, level_nodes, cells_by_id)
all_indexes = []
for i in range(len(cells)):
# get the indexes of all records that map to this cell
indexes = [j for j in mapping_to_cells if mapping_to_cells[j]['id'] == cells[i]['id']]
all_indexes.append(indexes)
return self._generalize_indexes(original_data, cells, all_indexes)
def _generalize_indexes(self, original_data, cells, all_indexes):
# prepared data include one hot encoded categorical data + QI
representatives = pd.DataFrame(columns=self._features) # empty except for columns
generalized = pd.DataFrame(prepared_data, columns=self.categorical_data.columns, copy=True)
original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)
mapping_to_cells = self._map_to_cells(generalized, level_nodes, cells_by_id)
# iterate over cells (leaves in decision tree)
for i in range(len(cells)):
# This code just copies the representatives from the cells into another data structure
@ -745,9 +708,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
elif feature in representatives.columns.tolist():
representatives = representatives.drop(feature, axis=1)
# get the indexes of all records that map to this cell
indexes = [j for j in mapping_to_cells if mapping_to_cells[j]['id'] == cells[i]['id']]
indexes = all_indexes[i]
# replaces the values in the representative columns with the representative values
# (leaves others untouched)
if indexes and not representatives.columns.empty:
@ -780,7 +741,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
current_accuracy)
if feature is None:
return None
GeneralizeToRepresentative._remove_feature_from_cells(self.cells_, self.cells_by_id_, feature)
GeneralizeToRepresentative._remove_feature_from_cells(self.cells, self._cells_by_id, feature)
return feature
def _get_feature_to_remove(self, original_data, prepared_data, nodes, labels, feature_data, current_accuracy):
@ -788,7 +749,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# if there is no categorical data prepared data is original data
# We want to remove features with low iLoss (NCP) and high accuracy gain
# (after removing them)
ranges = self.generalizations_['ranges']
ranges = self._generalizations['ranges']
range_counts = self._find_range_count(original_data, ranges)
total = prepared_data.size
range_min = sys.float_info.max
@ -797,18 +758,18 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
category_counts = self._find_categories_count(original_data, categories)
for feature in ranges.keys():
if feature not in self.generalizations_['untouched']:
if feature not in self._generalizations['untouched']:
feature_ncp = self._calc_ncp_numeric(ranges[feature],
range_counts[feature],
feature_data[feature],
total)
if feature_ncp > 0:
# divide by accuracy gain
new_cells = copy.deepcopy(self.cells_)
cells_by_id = copy.deepcopy(self.cells_by_id_)
new_cells = copy.deepcopy(self.cells)
cells_by_id = copy.deepcopy(self._cells_by_id)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
labels)) - current_accuracy
if accuracy_gain < 0:
accuracy_gain = 0
@ -827,11 +788,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
total)
if feature_ncp > 0:
# divide by accuracy loss
new_cells = copy.deepcopy(self.cells_)
cells_by_id = copy.deepcopy(self.cells_by_id_)
new_cells = copy.deepcopy(self.cells)
cells_by_id = copy.deepcopy(self._cells_by_id)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
labels)) - current_accuracy
if accuracy_gain < 0:
@ -846,12 +807,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return remove_feature
def _calculate_generalizations(self):
self.generalizations_ = {'ranges': GeneralizeToRepresentative._calculate_ranges(self.cells_),
'categories': GeneralizeToRepresentative._calculate_categories(self.cells_),
'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells_)}
self._generalizations = {'ranges': GeneralizeToRepresentative._calculate_ranges(self.cells),
'categories': GeneralizeToRepresentative._calculate_categories(self.cells),
'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells)}
self._remove_categorical_untouched(self._generalizations)
def _find_range_count(self, samples, ranges):
samples_df = pd.DataFrame(samples, columns=self.categorical_data.columns)
samples_df = pd.DataFrame(samples, columns=self._encoded_features)
range_counts = {}
last_value = None
for r in ranges.keys():
@ -1005,3 +967,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
del cell['categories'][feature]
cell['untouched'].append(feature)
cells_by_id[cell['id']] = cell.copy()
@staticmethod
def _remove_categorical_untouched(generalizations):
to_remove = []
for feature in generalizations['categories'].keys():
category_sizes = [len(g) if len(g) > 1 else 0 for g in generalizations['categories'][feature]]
if sum(category_sizes) == 0:
if 'untouched' not in generalizations:
generalizations['untouched'] = []
generalizations['untouched'].append(feature)
to_remove.append(feature)
for feature in to_remove:
del generalizations['categories'][feature]

View file

@ -6,6 +6,17 @@ from os import path, mkdir
from six.moves.urllib.request import urlretrieve
def get_iris_dataset_np(test_set: float = 0.3):
    """
    Load the Iris dataset from scikit-learn, split into a training part and a test part.

    :param test_set: Proportion of the data to use as validation split (value between 0 and 1). Default is 0.3
    :type test_set: float
    :return: The dataset and labels as numpy arrays, as a tuple (x_train, y_train), (x_test, y_test)
    """
    train_and_test = _load_iris(test_set)
    return train_and_test
def _load_iris(test_set_size: float = 0.3):
iris = datasets.load_iris()
data = iris.data
@ -18,14 +29,15 @@ def _load_iris(test_set_size: float = 0.3):
return (x_train, y_train), (x_test, y_test)
def get_iris_dataset(test_set: float = 0.3):
def get_diabetes_dataset_np(test_set: float = 0.3):
"""
Loads the Iris dataset from scikit-learn.
Loads the Diabetes dataset from scikit-learn.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1).
:return: Entire dataset and labels as numpy array.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1). Default is 0.3
:type test_set: float
:return: Entire dataset and labels as numpy arrays. Returned as a tuple (x_train, y_train), (x_test, y_test)
"""
return _load_iris(test_set)
return _load_diabetes(test_set)
def _load_diabetes(test_set_size: float = 0.3):
@ -40,22 +52,14 @@ def _load_diabetes(test_set_size: float = 0.3):
return (x_train, y_train), (x_test, y_test)
def get_diabetes_dataset():
def get_german_credit_dataset_pd(test_set: float = 0.3):
"""
Loads the Iris dataset from scikit-learn.
Loads the UCI German credit dataset from `tests/datasets/german` or downloads it from
https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/ if necessary.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1).
:return: Entire dataset and labels as numpy array.
"""
return _load_diabetes()
def get_german_credit_dataset(test_set: float = 0.3):
"""
Loads the UCI German_credit dataset from `tests/datasets/german` or downloads it if necessary.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1).
:return: Dataset and labels as pandas dataframes.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1). Default is 0.3
:type test_set: float
:return: Dataset and labels as pandas dataframes. Returned as a tuple (x_train, y_train), (x_test, y_test)
"""
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'
@ -118,15 +122,21 @@ def _modify_german_dataset(data):
return 1
else:
raise Exception('Bad value')
def modify_label(value):
return value - 1
data['Foreign_worker'] = data['Foreign_worker'].apply(modify_Foreign_worker)
data['Telephone'] = data['Telephone'].apply(modify_Telephone)
data['label'] = data['label'].apply(modify_label)
def get_adult_dataset():
def get_adult_dataset_pd():
"""
Loads the UCI Adult dataset from `tests/datasets/adult` or downloads it if necessary.
Loads the UCI Adult dataset from `tests/datasets/adult` or downloads it from
https://archive.ics.uci.edu/ml/machine-learning-databases/adult/ if necessary.
:return: Dataset and labels as pandas dataframes.
:return: Dataset and labels as pandas dataframes. Returned as a tuple (x_train, y_train), (x_test, y_test)
"""
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
@ -223,17 +233,22 @@ def _modify_adult_dataset(data):
return data.drop(['fnlwgt', 'education'], axis=1)
def get_nursery_dataset(raw: bool = True, test_set: float = 0.2, transform_social: bool = False):
def get_nursery_dataset_pd(raw: bool = True, test_set: float = 0.2, transform_social: bool = False):
"""
Loads the UCI Nursery dataset from `tests/datasets/nursery` or downloads it if necessary.
Loads the UCI Nursery dataset from `tests/datasets/nursery` or downloads it from
https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/ if necessary.
:param raw: `True` if no preprocessing should be applied to the data. Otherwise, categorical data is one-hot
encoded and data is scaled using sklearn's StandardScaler.
:param test_set: Proportion of the data to use as validation split. The value should be between 0 and 1.
:type raw: boolean
:param test_set: Proportion of the data to use as validation split. The value should be between 0 and 1. Default is
0.2
:type test_set: float
:param transform_social: If `True`, transforms the social feature to be binary for the purpose of attribute
inference. This is done by assigning the original value 'problematic' the new value 1, and
the other original values are assigned the new value 0.
:return: Dataset and labels as pandas dataframes.
:type transform_social: boolean
:return: Dataset and labels as pandas dataframes. Returned as a tuple (x_train, y_train), (x_test, y_test)
"""
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data'
data_dir = '../datasets/nursery'

View file

@ -4,4 +4,4 @@ Implementation of datasets utility components for datasets creation, load, and s
"""
from apt.utils.datasets.datasets import Dataset, StoredDataset, DatasetFactory, Data, ArrayDataset, \
OUTPUT_DATA_ARRAY_TYPE, DATA_PANDAS_NUMPY_TYPE
DatasetWithPredictions, OUTPUT_DATA_ARRAY_TYPE, DATA_PANDAS_NUMPY_TYPE

View file

@ -5,7 +5,7 @@ Implementation of utility classes for dataset handling
"""
from abc import ABCMeta, abstractmethod
from typing import Callable, Collection, Any, Union, List, Optional
from typing import Callable, Collection, Any, Union, List, Optional, Type
import tarfile
import os
@ -66,36 +66,68 @@ class Dataset(metaclass=ABCMeta):
@abstractmethod
def get_samples(self) -> Collection[Any]:
"""Return data samples"""
pass
"""
Return data samples
:return: the data samples
"""
raise NotImplementedError
@abstractmethod
def get_labels(self) -> Collection[Any]:
"""Return labels"""
pass
"""
Return labels
:return: the labels
"""
raise NotImplementedError
@abstractmethod
def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
    """
    Return the predictions held by the dataset.

    Abstract method; this base implementation only raises.

    :return: predictions as numpy array
    """
    raise NotImplementedError()
class StoredDataset(Dataset):
"""Abstract Class for Storable Dataset"""
"""Abstract Class for a Dataset that can be downloaded from a URL and stored in a file"""
@abstractmethod
def load_from_file(self, path: str):
"""Load dataset from file"""
pass
"""
Load dataset from file
:param path: the path to the file
:type path: string
:return: None
"""
raise NotImplementedError
@abstractmethod
def load(self, **kwargs):
"""Load dataset"""
pass
"""
Load dataset
:return: None
"""
raise NotImplementedError
@staticmethod
def download(url: str, dest_path: str, filename: str, unzip: bool = False) -> None:
def download(url: str, dest_path: str, filename: str, unzip: Optional[bool] = False) -> None:
"""
Download the dataset from URL
:param url: dataset URL, the dataset will be requested from this URL
:type url: string
:param dest_path: local dataset destination path
:type dest_path: string
:param filename: local dataset filename
:param unzip: flag whether or not perform extraction
:type filename: string
:param unzip: flag indicating whether or not to perform extraction. Default is False.
:type unzip: boolean, optional
:return: None
"""
file_path = os.path.join(dest_path, filename)
@ -113,12 +145,16 @@ class StoredDataset(Dataset):
StoredDataset.extract_archive(zip_path=file_path, dest_path=dest_path, remove_archive=False)
@staticmethod
def extract_archive(zip_path: str, dest_path=None, remove_archive=False):
def extract_archive(zip_path: str, dest_path: Optional[str] = None, remove_archive: Optional[bool] = False):
"""
Extract dataset from archived file
:param zip_path: path to archived file
:type zip_path: string
:param dest_path: directory path to uncompress the file to
:param remove_archive: whether remove the archive file after uncompress (default False)
:type dest_path: string, optional
:param remove_archive: whether to remove the archive file after extraction. Default is False.
:type remove_archive: boolean, optional
:return: None
"""
logger.info("Extracting the dataset...")
@ -132,15 +168,23 @@ class StoredDataset(Dataset):
logger.info("Extracted the dataset")
@staticmethod
def split_debug(datafile: str, dest_datafile: str, ratio: int, shuffle=True, delimiter=",", fmt=None) -> None:
def split_debug(datafile: str, dest_datafile: str, ratio: int, shuffle: Optional[bool] = True,
delimiter: Optional[str] = ",", fmt: Optional[Union[str, list]] = None) -> None:
"""
Split the data and take only a part of it
:param datafile: dataset file path
:type datafile: string
:param dest_datafile: destination path for the partial dataset file
:type dest_datafile: string
:param ratio: part of the dataset to save
:param shuffle: whether to shuffle the data or not (default True)
:param delimiter: dataset delimiter (default ",")
:param fmt: format for the correct data saving
:type ratio: int
:param shuffle: whether to shuffle the data or not. Default is True.
:type shuffle: boolean, optional
:param delimiter: dataset delimiter. Default is ","
:type delimiter: string, optional
:param fmt: format for the correct data saving. As defined by numpy.savetxt(). Default is None.
:type fmt: string or sequence of strings, optional
:return: None
"""
if os.path.isfile(dest_datafile):
@ -160,22 +204,19 @@ class StoredDataset(Dataset):
class ArrayDataset(Dataset):
"""Dataset that is based on x and y arrays (e.g., numpy/pandas/list...)"""
"""
Dataset that is based on x and y arrays (e.g., numpy/pandas/list...)
def __init__(
self,
x: INPUT_DATA_ARRAY_TYPE,
y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
features_names: Optional = None,
**kwargs,
):
"""
ArrayDataset constructor.
:param x: collection of data samples
:param y: collection of labels (optional)
:param feature_names: list of str, The feature names, in the order that they appear in the data (optional)
:param kwargs: dataset parameters
"""
:param x: collection of data samples
:type x: numpy array or pandas DataFrame or list or pytorch Tensor
:param y: collection of labels
:type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
:param features_names: The feature names, in the order that they appear in the data
:type features_names: list of strings, optional
"""
def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
features_names: Optional[list] = None, **kwargs):
self.is_pandas = self.is_pandas = type(x) == pd.DataFrame or type(x) == pd.Series
self.features_names = features_names
@ -187,26 +228,100 @@ class ArrayDataset(Dataset):
raise ValueError("The supplied features are not the same as in the data features")
self.features_names = x.columns.to_list()
if y is not None and len(self._x) != len(self._y):
if self._y is not None and len(self._x) != len(self._y):
raise ValueError("Non equivalent lengths of x and y")
def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""Return data samples as numpy array"""
"""
Get data samples
:return: data samples as numpy array
"""
return self._x
def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""Return labels as numpy array"""
"""
Get labels
:return: labels as numpy array
"""
return self._y
def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
    """
    Get predictions. This implementation holds no predictions.

    :return: always None
    """
    no_predictions = None
    return no_predictions
class DatasetWithPredictions(Dataset):
    """
    Dataset that is based on arrays (e.g., numpy/pandas/list...) and carries the predictions of a model,
    possibly together with the samples and the true labels.

    :param pred: collection of model predictions
    :type pred: numpy array or pandas DataFrame or list or pytorch Tensor
    :param x: collection of data samples
    :type x: numpy array or pandas DataFrame or list or pytorch Tensor, optional
    :param y: collection of labels
    :type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
    :param features_names: The feature names, in the order that they appear in the data
    :type features_names: list of strings, optional
    """

    def __init__(self, pred: INPUT_DATA_ARRAY_TYPE, x: Optional[INPUT_DATA_ARRAY_TYPE] = None,
                 y: Optional[INPUT_DATA_ARRAY_TYPE] = None, features_names: Optional[list] = None, **kwargs):
        # NOTE(review): is_pandas starts as False, so the pandas branch below only runs if
        # _array2numpy (defined outside this class) switches it on -- confirm.
        self.is_pandas = False
        self.features_names = features_names

        # Conversions are done in the order pred, y, x; missing inputs stay None.
        self._pred = self._array2numpy(pred)
        self._y = None if y is None else self._array2numpy(y)
        self._x = None if x is None else self._array2numpy(x)

        if x is not None and self.is_pandas:
            if features_names and not np.array_equal(features_names, x.columns):
                raise ValueError("The supplied features are not the same as in the data features")
            # the column names of the data take over as the feature names
            self.features_names = x.columns.to_list()

        # Everything that was supplied must have as many entries as the predictions.
        if self._y is not None:
            if len(self._pred) != len(self._y):
                raise ValueError('Non equivalent lengths of pred and y')
        if self._x is not None:
            if len(self._x) != len(self._pred):
                raise ValueError('Non equivalent lengths of x and pred')

    def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Get data samples

        :return: the converted samples, or None if no samples were supplied
        """
        return self._x

    def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Get labels

        :return: the converted labels, or None if no labels were supplied
        """
        return self._y

    def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Get predictions

        :return: the converted predictions
        """
        return self._pred
class PytorchData(Dataset):
"""
Dataset for pytorch models.
:param x: collection of data samples
:type x: numpy array or pandas DataFrame or list or pytorch Tensor
:param y: collection of labels
:type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
"""
def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None, **kwargs):
"""
PytorchData constructor.
:param x: collection of data samples
:param y: collection of labels (optional)
:param kwargs: dataset parameters
"""
self._y = array2torch_tensor(y) if y is not None else None
self._x = array2torch_tensor(x)
@ -215,7 +330,7 @@ class PytorchData(Dataset):
if self.is_pandas:
self.features_names = x.columns
if y is not None and len(self._x) != len(self._y):
if self._y is not None and len(self._x) != len(self._y):
raise ValueError("Non equivalent lengths of x and y")
if self._y is not None:
@ -224,17 +339,47 @@ class PytorchData(Dataset):
self.__getitem__ = self.get_sample_item
def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""Return data samples as numpy array"""
"""
Get data samples.
:return: samples as numpy array
"""
return array2numpy(self._x)
def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""Return labels as numpy array"""
"""
Get labels.
:return: labels as numpy array
"""
return array2numpy(self._y) if self._y is not None else None
def get_sample_item(self, idx) -> Tensor:
def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
    """
    Get predictions

    :return: always None; this method does not read any stored predictions
    """
    return None
def get_sample_item(self, idx: int) -> Tensor:
    """
    Get the sample according to the given index

    :param idx: the index of the sample to return
    :type idx: int
    :return: the entry of the stored samples (``self._x``) at position ``idx``, as a pytorch Tensor
    """
    return self._x[idx]
def get_item(self, idx) -> Tensor:
def get_item(self, idx: int) -> Tensor:
    """
    Get the sample and its label according to the given index.

    Requires labels: ``self._y`` is None when the dataset was built without labels, and indexing it fails.

    :param idx: the index of the sample to return
    :type idx: int
    :return: a tuple ``(sample, label)`` taken from the stored samples and labels at position ``idx``
    """
    return self._x[idx], self._y[idx]
@ -251,11 +396,13 @@ class DatasetFactory:
def register(cls, name: str) -> Callable:
"""
Class method to register Dataset to the internal registry
:param name: dataset name
:return:
:type name: string
:return: a Callable that returns the registered dataset class
"""
def inner_wrapper(wrapped_class: Dataset) -> Any:
def inner_wrapper(wrapped_class: Type[Dataset]) -> Any:
if name in cls.registry:
logger.warning("Dataset %s already exists. Will replace it", name)
cls.registry[name] = wrapped_class
@ -267,11 +414,15 @@ class DatasetFactory:
def create_dataset(cls, name: str, **kwargs) -> Dataset:
"""
Factory command to create dataset instance.
This method gets the appropriate Dataset class from the registry
and creates an instance of it, while passing in the parameters
given in ``kwargs``.
:param name: The name of the dataset to create.
:type name: string
:param kwargs: dataset parameters
:type kwargs: keyword arguments as expected by the class
:return: An instance of the dataset that is created.
"""
if name not in cls.registry:
@ -285,13 +436,19 @@ class DatasetFactory:
class Data:
def __init__(self, train: Dataset = None, test: Dataset = None, **kwargs):
"""
Class for storing train and test datasets.
:param train: the training set
:type train: `Dataset`
:param test: the test set
:type test: `Dataset`, optional
"""
def __init__(self, train: Dataset = None, test: Optional[Dataset] = None, **kwargs):
"""
Data class constructor.
The class stores train and test datasets.
If neither of the datasets was provided,
Both train and test datasets will be create using
DatasetFactory to create a dataset instance
If neither of the datasets was provided, both train and test datasets will be created using `DatasetFactory`.
"""
if train or test:
self.train = train
@ -301,25 +458,77 @@ class Data:
self.test = DatasetFactory.create_dataset(train=False, **kwargs)
def get_train_set(self) -> Dataset:
    """
    Get training set

    :return: training `Dataset`
    """
    return self.train
def get_test_set(self) -> Dataset:
    """
    Get test set

    :return: test `Dataset`
    """
    return self.test
def get_train_samples(self) -> Collection[Any]:
    """
    Get the samples of the training set.

    :return: training samples, or None if no training set is stored
    """
    train_set = self.train
    return None if train_set is None else train_set.get_samples()
def get_train_labels(self) -> Collection[Any]:
    """
    Get the labels of the training set.

    :return: training labels, or None if no training set is stored
    """
    train_set = self.train
    return None if train_set is None else train_set.get_labels()
def get_train_predictions(self) -> Collection[Any]:
    """
    Get the predictions of the training set.

    :return: training predictions, or None if no training set is stored
    """
    train_set = self.train
    return None if train_set is None else train_set.get_predictions()
def get_test_samples(self) -> Collection[Any]:
    """
    Get the samples of the test set.

    :return: test samples, or None if no test set is stored
    """
    test_set = self.test
    return None if test_set is None else test_set.get_samples()
def get_test_labels(self) -> Collection[Any]:
    """
    Get the labels of the test set.

    :return: test labels, or None if no test set is stored
    """
    test_set = self.test
    return None if test_set is None else test_set.get_labels()
def get_test_predictions(self) -> Collection[Any]:
    """
    Get the predictions of the test set.

    :return: test predictions, or None if no test set is stored
    """
    test_set = self.test
    return None if test_set is None else test_set.get_predictions()

View file

@ -1,2 +1,6 @@
from apt.utils.models.model import Model, ModelOutputType
from apt.utils.models.model import Model, BlackboxClassifier, ModelOutputType, ScoringMethod, \
BlackboxClassifierPredictions, BlackboxClassifierPredictFunction, get_nb_classes, is_one_hot, \
check_correct_model_output
from apt.utils.models.sklearn_model import SklearnModel, SklearnClassifier, SklearnRegressor
from apt.utils.models.keras_model import KerasClassifier, KerasRegressor
from apt.utils.models.xgboost_model import XGBoostClassifier

View file

@ -0,0 +1,151 @@
from typing import Optional
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from tensorflow import keras
tf.compat.v1.disable_eager_execution()
from sklearn.metrics import mean_squared_error
from apt.utils.models import Model, ModelOutputType, ScoringMethod, check_correct_model_output
from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
from art.utils import check_and_transform_label_format
from art.estimators.classification.keras import KerasClassifier as ArtKerasClassifier
from art.estimators.regression.keras import KerasRegressor as ArtKerasRegressor
class KerasModel(Model):
    """
    Wrapper class for keras models.

    Adds no behaviour of its own on top of `Model`; it is the common base of the keras classifier and
    regressor wrappers defined in this module.
    """
class KerasClassifier(KerasModel):
    """
    Wrapper class for keras classification models.

    :param model: The original keras model object.
    :type model: `keras.models.Model`
    :param output_type: The type of output the model yields (vector/label only)
    :type output_type: `ModelOutputType`
    :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                             Set to True if the model is only available via query (API) access, i.e.,
                             only the outputs of the model are exposed, and False if the model internals
                             are also available. Default is True.
    :type black_box_access: boolean, optional
    :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
                              unlimited queries to the model API or whether there is a limit to the number of
                              queries that can be submitted. Default is True.
    :type unlimited_queries: boolean, optional
    """

    def __init__(self, model: keras.models.Model, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
                 unlimited_queries: Optional[bool] = True, **kwargs):
        super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs)
        # The ART wrapper must be told whether the network emits raw logits rather than probabilities.
        emits_logits = output_type == ModelOutputType.CLASSIFIER_LOGITS
        self._art_model = ArtKerasClassifier(model, use_logits=emits_logits)

    def fit(self, train_data: Dataset, **kwargs) -> None:
        """
        Fit the model using the training data.

        :param train_data: Training data. Labels are expected to either be one-hot encoded or a 1D-array of
                           categorical labels (consecutive integers starting at 0).
        :type train_data: `Dataset`
        :param kwargs: Forwarded unchanged to the ART classifier's ``fit``.
        :return: None
        """
        one_hot_labels = check_and_transform_label_format(train_data.get_labels(), self._art_model.nb_classes)
        self._art_model.fit(train_data.get_samples(), one_hot_labels, **kwargs)

    def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Perform predictions using the model for input `x`.

        :param x: Input samples.
        :type x: `Dataset`
        :param kwargs: Forwarded unchanged to the ART classifier's ``predict``.
        :return: Predictions from the model as numpy array (class probabilities, if supported).
        :raises: ValueError if the output does not match the declared output type
                 (raised by ``check_correct_model_output``).
        """
        model_output = self._art_model.predict(x.get_samples(), **kwargs)
        check_correct_model_output(model_output, self.output_type)
        return model_output

    def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.ACCURACY, **kwargs):
        """
        Score the model using test data.

        :param test_data: Test data.
        :type test_data: `Dataset`
        :param scoring_method: The method for scoring predictions. Default is ACCURACY.
        :type scoring_method: `ScoringMethod`, optional
        :return: the score as float (between 0 and 1)
        :raises: NotImplementedError for any scoring method other than ACCURACY.
        """
        true_labels = check_and_transform_label_format(test_data.get_labels(), self._art_model.nb_classes)
        predicted = self.predict(test_data)
        if scoring_method != ScoringMethod.ACCURACY:
            raise NotImplementedError
        # Compare the winning class index per row of the labels and of the predictions.
        hits = np.argmax(true_labels, axis=1) == np.argmax(predicted, axis=1)
        return np.count_nonzero(hits) / predicted.shape[0]
class KerasRegressor(KerasModel):
    """
    Wrapper class for keras regression models.

    The output type is always `ModelOutputType.REGRESSOR_SCALAR`.

    :param model: The original keras model object.
    :type model: `keras.models.Model`
    :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                             Set to True if the model is only available via query (API) access, i.e.,
                             only the outputs of the model are exposed, and False if the model internals
                             are also available. Default is True.
    :type black_box_access: boolean, optional
    :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
                              unlimited queries to the model API or whether there is a limit to the number of
                              queries that can be submitted. Default is True.
    :type unlimited_queries: boolean, optional
    """

    def __init__(self, model: keras.models.Model, black_box_access: Optional[bool] = True,
                 unlimited_queries: Optional[bool] = True, **kwargs):
        regressor_output = ModelOutputType.REGRESSOR_SCALAR
        super().__init__(model, regressor_output, black_box_access, unlimited_queries, **kwargs)
        self._art_model = ArtKerasRegressor(model)

    def fit(self, train_data: Dataset, **kwargs) -> None:
        """
        Fit the model using the training data.

        :param train_data: Training data.
        :type train_data: `Dataset`
        :param kwargs: Forwarded unchanged to the ART regressor's ``fit``.
        :return: None
        """
        self._art_model.fit(train_data.get_samples(), train_data.get_labels(), **kwargs)

    def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Perform predictions using the model for input `x`.

        :param x: Input samples.
        :type x: `Dataset`
        :param kwargs: Forwarded unchanged to the ART regressor's ``predict``.
        :return: Predictions from the model as numpy array.
        """
        return self._art_model.predict(x.get_samples(), **kwargs)

    def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.MEAN_SQUARED_ERROR,
              **kwargs):
        """
        Score the model using test data.

        :param test_data: Test data.
        :type test_data: `Dataset`
        :param scoring_method: The method for scoring predictions. Default is MEAN_SQUARED_ERROR.
        :type scoring_method: `ScoringMethod`, optional
        :return: the score as float
        :raises: NotImplementedError for any scoring method other than MEAN_SQUARED_ERROR.
        """
        predicted = self.predict(test_data)
        if scoring_method != ScoringMethod.MEAN_SQUARED_ERROR:
            raise NotImplementedError('Only MEAN_SQUARED_ERROR supported as scoring method')
        return mean_squared_error(test_data.get_labels(), predicted)

View file

@ -1,37 +1,91 @@
from abc import ABCMeta, abstractmethod
from typing import Any, Optional
from typing import Any, Optional, Callable, Tuple, Union
from enum import Enum, auto
import numpy as np
from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
from apt.utils.datasets import Dataset, Data, OUTPUT_DATA_ARRAY_TYPE
from art.estimators.classification import BlackBoxClassifier
from art.utils import check_and_transform_label_format
class ModelOutputType(Enum):
    """
    The kind of output a wrapped model yields.
    """
    CLASSIFIER_VECTOR = auto()  # probabilities or logits
    CLASSIFIER_PROBABILITIES = auto()  # vector of probabilities
    CLASSIFIER_LOGITS = auto()  # vector of logits
    CLASSIFIER_SCALAR = auto()  # label only
    REGRESSOR_SCALAR = auto()  # value
class ModelType(Enum):
    """
    Identifiers for model families; accepted as the ``model_type`` argument of `BlackboxClassifier`.
    """
    SKLEARN_DECISION_TREE = auto()
    SKLEARN_GRADIENT_BOOSTING = auto()
class ScoringMethod(Enum):
    """
    The metric used by the ``score`` methods of the model wrappers.
    """
    ACCURACY = auto()  # number of correct predictions divided by the number of samples
    MEAN_SQUARED_ERROR = auto()  # mean squared error between the predictions and true labels
def is_one_hot(y: OUTPUT_DATA_ARRAY_TYPE) -> bool:
    """
    Tell whether `y` is a two-dimensional array with more than one column.

    :param y: The array to inspect (only its ``shape`` is read)
    :type y: numpy array
    :return: True if `y` has exactly two dimensions and more than one column, otherwise False
    """
    dims = y.shape
    return len(dims) == 2 and dims[1] > 1
def get_nb_classes(y: OUTPUT_DATA_ARRAY_TYPE) -> int:
    """
    Get the number of classes from an array of labels

    :param y: The labels
    :type y: numpy array
    :return: The number of classes as integer: 0 when `y` is None, the number of columns when `y` is
             multi-column (see `is_one_hot`), otherwise the largest label value plus one.
    :raises: ValueError if `y` is neither None nor a numpy array
    """
    if y is None:
        return 0

    # isinstance (rather than an exact type comparison) also admits ndarray subclasses.
    if not isinstance(y, np.ndarray):
        raise ValueError("Input should be numpy array")

    if is_one_hot(y):
        return y.shape[1]
    # Scalar labels are assumed to be consecutive integers starting at 0.
    return int(np.max(y) + 1)
def check_correct_model_output(y: OUTPUT_DATA_ARRAY_TYPE, output_type: ModelOutputType):
    """
    Checks whether there is a mismatch between the declared model output type and its actual output.

    :param y: Model output
    :type y: numpy array
    :param output_type: Declared output type (provided at init)
    :type output_type: ModelOutputType
    :raises: ValueError (in case of mismatch)
    """
    vector_types = (ModelOutputType.CLASSIFIER_PROBABILITIES, ModelOutputType.CLASSIFIER_LOGITS)
    # is_one_hot() is False for 1D arrays and also for 2D arrays with a single column.
    if not is_one_hot(y) and output_type in vector_types:
        # Build one message string; passing output_type as a second exception argument
        # made the error render as a tuple.
        raise ValueError("Incompatible model output types. Model outputs 1D array of categorical scalars while "
                         f"output type is set to {output_type}")
class Model(metaclass=ABCMeta):
"""
Abstract base class for ML model wrappers.
:param model: The original model object (of the underlying ML framework)
:type model: framework-specific model object
:param output_type: The type of output the model yields (vector/label only for classifiers,
value for regressors)
:type output_type: `ModelOutputType`
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Default is True.
:type black_box_access: boolean, optional
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Default is True.
:type unlimited_queries: boolean, optional
"""
def __init__(self, model: Any, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
unlimited_queries: Optional[bool] = True, **kwargs):
"""
Initialize a `Model` wrapper object.
:param model: The original model object (of the underlying ML framework)
:param output_type: The type of output the model yields (vector/label only for classifiers,
value for regressors)
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Optional, Default is True.
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Optional, Default is True.
"""
self._model = model
self._output_type = output_type
self._black_box_access = black_box_access
@ -53,8 +107,8 @@ class Model(metaclass=ABCMeta):
Perform predictions using the model for input `x`.
:param x: Input samples.
:type x: `np.ndarray` or `pandas.DataFrame`
:return: Predictions from the model.
:type x: `Dataset`
:return: Predictions from the model as numpy array.
"""
raise NotImplementedError
@ -65,13 +119,14 @@ class Model(metaclass=ABCMeta):
:param test_data: Test data.
:type train_data: `Dataset`
:return: the score as float (for classifiers, between 0 and 1)
"""
return NotImplementedError
@property
def model(self) -> Any:
"""
Return the model.
Return the underlying model.
:return: The model.
"""
@ -89,21 +144,223 @@ class Model(metaclass=ABCMeta):
@property
def black_box_access(self) -> bool:
    """
    Return whether the model is only available via query (API) access, i.e.,
    only the outputs of the model are exposed, or if the model internals are also available.

    :return: True if the model is only available via query (API) access, otherwise False.
    """
    return self._black_box_access
@property
def unlimited_queries(self) -> bool:
    """
    If black_box_access is True, return whether a user can perform unlimited queries to the model API
    or whether there is a limit to the number of queries that can be submitted.

    :return: True if a user can perform unlimited queries to the model API, otherwise False.
    """
    return self._unlimited_queries
class BlackboxClassifier(Model):
    """
    Wrapper for black-box ML classification models.

    This class does not create ``self._art_model``; `predict` and `score` rely on it being set elsewhere
    (the subclasses in this module assign it in their constructors).

    :param model: The training and/or test data along with the model's predictions for the data or a callable predict
                  method.
    :type model: `Data` object or Callable
    :param output_type: The type of output the model yields (vector/label only)
    :type output_type: `ModelOutputType`
    :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                             Always assumed to be True (black box) for this wrapper.
    :type black_box_access: boolean, optional
    :param unlimited_queries: Boolean indicating whether a user can perform unlimited queries to the model API.
    :type unlimited_queries: boolean, optional
    :param model_type: The type of model this BlackboxClassifier represents. Needed in order to build and/or fit
                       similar dummy/shadow models.
    :type model_type: Either a (unfitted) model object of the underlying framework, or a ModelType representing the
                      type of the model, optional.
    """

    def __init__(self, model: Any, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
                 unlimited_queries: Optional[bool] = True, model_type: Optional[Union[Any, ModelType]] = None,
                 **kwargs):
        # The black_box_access argument is ignored on purpose: this wrapper is always black-box.
        super().__init__(model, output_type, black_box_access=True, unlimited_queries=unlimited_queries, **kwargs)
        self._model_type = model_type
        self._input_shape = None
        self._nb_classes = None

    @property
    def nb_classes(self) -> int:
        """
        Return the number of prediction classes of the model.

        :return: Number of prediction classes of the model.
        """
        return self._nb_classes

    @property
    def input_shape(self) -> Tuple[int, ...]:
        """
        Return the shape of input to the model.

        :return: Shape of input to the model.
        """
        return self._input_shape

    @property
    def model_type(self) -> Optional[Union[Any, ModelType]]:
        """
        Return the type of the model.

        :return: Either a (unfitted) model object of the underlying framework, or a ModelType representing the type of
                 the model, or None (if none provided at init).
        """
        return self._model_type

    def fit(self, train_data: Dataset, **kwargs) -> None:
        """
        A blackbox model cannot be fit.

        :raises: NotImplementedError, always.
        """
        raise NotImplementedError

    def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Get predictions from the model for input `x`. `x` must be a subset of the data provided in the `model` data in
        `__init__()`.

        :param x: Input samples.
        :type x: `Dataset`
        :param kwargs: Accepted but not used by this implementation.
        :return: Predictions from the model as numpy array.
        :raises: ValueError if the output does not match the declared output type
                 (raised by ``check_correct_model_output``).
        """
        model_output = self._art_model.predict(x.get_samples())
        check_correct_model_output(model_output, self.output_type)
        return model_output

    def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.ACCURACY, **kwargs):
        """
        Score the model using test data.

        :param test_data: Test data.
        :type test_data: `Dataset`
        :param scoring_method: The method for scoring predictions. Default is ACCURACY.
        :type scoring_method: `ScoringMethod`, optional
        :return: the score as float (for classifiers, between 0 and 1)
        :raises: ValueError if the test samples or labels are missing; NotImplementedError for any scoring
                 method other than ACCURACY.
        """
        if test_data.get_samples() is None or test_data.get_labels() is None:
            raise ValueError('score can only be computed when test data and labels are available')
        predicted = self._art_model.predict(test_data.get_samples())
        true_labels = check_and_transform_label_format(test_data.get_labels(), nb_classes=self._nb_classes)
        if scoring_method != ScoringMethod.ACCURACY:
            raise NotImplementedError
        # Compare the winning class index per row of the labels and of the predictions.
        hits = np.argmax(true_labels, axis=1) == np.argmax(predicted, axis=1)
        return np.count_nonzero(hits) / predicted.shape[0]
class BlackboxClassifierPredictions(BlackboxClassifier):
    """
    Wrapper for black-box ML classification models using data and predictions.

    For each of the train and test sets, the predictions stored in the dataset are used; when a set has no
    predictions, its labels are used instead. 1D outputs are converted with
    ``check_and_transform_label_format`` before being handed to the ART black-box classifier.

    :param model: The training and/or test data along with the model's predictions for the data. Assumes that the data
                  is represented as numpy arrays. Labels are expected to either be class probabilities (multi-column) or
                  a 1D-array of categorical labels (consecutive integers starting at 0).
    :type model: `Data` object
    :param output_type: The type of output the model yields (vector/label only)
    :type output_type: `ModelOutputType`
    :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                             Always assumed to be True for this wrapper.
    :type black_box_access: boolean, optional
    :param unlimited_queries: Boolean indicating whether a user can perform unlimited queries to the model API.
                              Always assumed to be False for this wrapper.
    :type unlimited_queries: boolean, optional
    :raises: ValueError if the outputs contradict `output_type`; NotImplementedError if neither a complete
             train pair nor a complete test pair (samples and outputs) is available, or if train and test
             data are both given and any of the four arrays is not a numpy array.
    """

    def __init__(self, model: Data, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
                 unlimited_queries: Optional[bool] = True, **kwargs):
        super().__init__(model, output_type, black_box_access=True, unlimited_queries=False, **kwargs)
        # Prefer stored predictions; fall back to the labels when a set carries no predictions.
        x_train_pred = model.get_train_samples()
        y_train_pred = model.get_train_predictions()
        if y_train_pred is None:
            y_train_pred = model.get_train_labels()
        x_test_pred = model.get_test_samples()
        y_test_pred = model.get_test_predictions()
        if y_test_pred is None:
            y_test_pred = model.get_test_labels()

        if y_train_pred is not None:
            check_correct_model_output(y_train_pred, self.output_type)
        if y_test_pred is not None:
            check_correct_model_output(y_test_pred, self.output_type)

        # 1D outputs are transformed so that train and test outputs can be stacked row-wise below.
        if y_train_pred is not None and len(y_train_pred.shape) == 1:
            self._nb_classes = get_nb_classes(y_train_pred)
            y_train_pred = check_and_transform_label_format(y_train_pred, nb_classes=self._nb_classes)
        if y_test_pred is not None and len(y_test_pred.shape) == 1:
            if self._nb_classes is None:
                self._nb_classes = get_nb_classes(y_test_pred)
            y_test_pred = check_and_transform_label_format(y_test_pred, nb_classes=self._nb_classes)

        has_train = x_train_pred is not None and y_train_pred is not None
        has_test = x_test_pred is not None and y_test_pred is not None
        if has_train and has_test:
            # Validate all four arrays; the earlier condition tested the two y arrays twice
            # and never looked at the x arrays.
            arrays = (x_train_pred, y_train_pred, x_test_pred, y_test_pred)
            if not all(isinstance(arr, np.ndarray) for arr in arrays):
                raise NotImplementedError("X/Y Data should be numpy array")
            x_pred = np.vstack((x_train_pred, x_test_pred))
            y_pred = np.vstack((y_train_pred, y_test_pred))
        elif has_test:
            x_pred = x_test_pred
            y_pred = y_test_pred
        elif has_train:
            x_pred = x_train_pred
            y_pred = y_train_pred
        else:
            raise NotImplementedError("Invalid data - None")

        self._nb_classes = get_nb_classes(y_pred)
        self._input_shape = x_pred.shape[1:]
        self._x_pred = x_pred
        self._y_pred = y_pred
        # ART accepts a (samples, predictions) tuple in place of a predict function.
        predict_fn = (x_pred, y_pred)
        self._art_model = BlackBoxClassifier(predict_fn, self._input_shape, self._nb_classes, fuzzy_float_compare=True,
                                             preprocessing=None)

    def get_predictions(self) -> Tuple[OUTPUT_DATA_ARRAY_TYPE, OUTPUT_DATA_ARRAY_TYPE]:
        """
        Return all the data for which the model contains predictions.

        :return: Tuple containing data and predictions as numpy arrays.
        """
        return self._x_pred, self._y_pred
class BlackboxClassifierPredictFunction(BlackboxClassifier):
    """
    Wrapper for black-box ML classification models using a predict function.

    :param model: Function that takes in an `np.ndarray` of input data and returns predictions either as class
                  probabilities (multi-column) or a 1D-array of categorical labels (consecutive integers starting at 0).
    :type model: Callable
    :param output_type: The type of output the model yields (vector/label only)
    :type output_type: `ModelOutputType`
    :param input_shape: Shape of input to the model.
    :type input_shape: Tuple[int, ...]
    :param nb_classes: Number of prediction classes of the model.
    :type nb_classes: int
    :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                             Always assumed to be True for this wrapper.
    :type black_box_access: boolean, optional
    :param unlimited_queries: Boolean indicating whether a user can perform unlimited queries to the model API.
    :type unlimited_queries: boolean, optional
    """

    def __init__(self, model: Callable, output_type: ModelOutputType, input_shape: Tuple[int, ...], nb_classes: int,
                 black_box_access: Optional[bool] = True, unlimited_queries: Optional[bool] = True, **kwargs):
        super().__init__(model, output_type, black_box_access=True, unlimited_queries=unlimited_queries, **kwargs)
        self._input_shape = input_shape
        self._nb_classes = nb_classes

        def predict_wrapper(x):
            # Multi-column output is passed through; anything else is converted to one-hot form.
            raw_output = self.model(x)
            if is_one_hot(raw_output):
                return raw_output
            return check_and_transform_label_format(raw_output, nb_classes=nb_classes, return_one_hot=True)

        self._art_model = BlackBoxClassifier(predict_wrapper, self._input_shape, self._nb_classes, preprocessing=None)

View file

@ -1,15 +1,14 @@
from typing import Optional
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator
from apt.utils.models import Model, ModelOutputType
from apt.utils.models import Model, ModelOutputType, get_nb_classes, check_correct_model_output
from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
from art.estimators.classification.scikitlearn import SklearnClassifier as ArtSklearnClassifier
from art.estimators.regression.scikitlearn import ScikitlearnRegressor
from art.utils import check_and_transform_label_format
class SklearnModel(Model):
@ -22,6 +21,7 @@ class SklearnModel(Model):
:param test_data: Test data.
:type train_data: `Dataset`
:return: the score as float (for classifiers, between 0 and 1)
"""
return self.model.score(test_data.get_samples(), test_data.get_labels(), **kwargs)
@ -29,23 +29,23 @@ class SklearnModel(Model):
class SklearnClassifier(SklearnModel):
"""
Wrapper class for scikitlearn classification models.
:param model: The original sklearn model object.
:type model: scikitlearn classifier object
:param output_type: The type of output the model yields (vector/label only)
:type output_type: `ModelOutputType`
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Default is True.
:type black_box_access: boolean, optional
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Default is True.
:type unlimited_queries: boolean, optional
"""
def __init__(self, model: BaseEstimator, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
             unlimited_queries: Optional[bool] = True, **kwargs):
    """
    Initialize a `SklearnClassifier` wrapper object.

    :param model: The original sklearn model object.
    :param output_type: The type of output the model yields (vector/label only for classifiers,
                        value for regressors)
    :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                             Set to True if the model is only available via query (API) access, i.e.,
                             only the outputs of the model are exposed, and False if the model internals
                             are also available. Optional, Default is True.
    :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
                              unlimited queries to the model API or whether there is a limit to the number of
                              queries that can be submitted. Optional, Default is True.
    """
    super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs)
    # Keep an ART wrapper around the same estimator.
    self._art_model = ArtSklearnClassifier(model)
@ -53,11 +53,14 @@ class SklearnClassifier(SklearnModel):
"""
Fit the model using the training data.
:param train_data: Training data.
:param train_data: Training data. Labels are expected to either be one-hot encoded or a 1D-array of categorical
labels (consecutive integers starting at 0).
:type train_data: `Dataset`
:return: None
"""
encoder = OneHotEncoder(sparse=False)
y_encoded = encoder.fit_transform(train_data.get_labels().reshape(-1, 1))
y = train_data.get_labels()
self.nb_classes = get_nb_classes(y)
y_encoded = check_and_transform_label_format(y, nb_classes=self.nb_classes)
self._art_model.fit(train_data.get_samples(), y_encoded, **kwargs)
def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
@ -65,30 +68,32 @@ class SklearnClassifier(SklearnModel):
Perform predictions using the model for input `x`.
:param x: Input samples.
:type x: `np.ndarray` or `pandas.DataFrame`
:return: Predictions from the model (class probabilities, if supported).
:type x: `Dataset`
:return: Predictions from the model as numpy array (class probabilities, if supported).
"""
return self._art_model.predict(x, **kwargs)
predictions = self._art_model.predict(x.get_samples(), **kwargs)
check_correct_model_output(predictions, self.output_type)
return predictions
class SklearnRegressor(SklearnModel):
"""
Wrapper class for scikitlearn regression models.
:param model: The original sklearn model object.
:type model: scikitlearn regressor object
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Default is True.
:type black_box_access: boolean, optional
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Default is True.
:type unlimited_queries: boolean, optional
"""
def __init__(self, model: BaseEstimator, black_box_access: Optional[bool] = True,
             unlimited_queries: Optional[bool] = True, **kwargs):
    """
    Initialize a `SklearnRegressor` wrapper object.

    The output type is always `ModelOutputType.REGRESSOR_SCALAR`.

    :param model: The original sklearn model object.
    :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                             Set to True if the model is only available via query (API) access, i.e.,
                             only the outputs of the model are exposed, and False if the model internals
                             are also available. Optional, Default is True.
    :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
                              unlimited queries to the model API or whether there is a limit to the number of
                              queries that can be submitted. Optional, Default is True.
    """
    super().__init__(model, ModelOutputType.REGRESSOR_SCALAR, black_box_access, unlimited_queries, **kwargs)
    # Keep an ART wrapper around the same estimator.
    self._art_model = ScikitlearnRegressor(model)
@ -98,6 +103,7 @@ class SklearnRegressor(SklearnModel):
:param train_data: Training data.
:type train_data: `Dataset`
:return: None
"""
self._art_model.fit(train_data.get_samples(), train_data.get_labels(), **kwargs)
@ -106,7 +112,7 @@ class SklearnRegressor(SklearnModel):
Perform predictions using the model for input `x`.
:param x: Input samples.
:type x: `np.ndarray` or `pandas.DataFrame`
:return: Predictions from the model.
:type x: `Dataset`
:return: Predictions from the model as numpy array.
"""
return self._art_model.predict(x, **kwargs)
return self._art_model.predict(x.get_samples(), **kwargs)

View file

@ -0,0 +1,87 @@
from typing import Optional, Tuple
from apt.utils.models import Model, ModelOutputType, ScoringMethod, check_correct_model_output, is_one_hot
from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
from xgboost import XGBClassifier
import numpy as np
from art.estimators.classification.xgboost import XGBoostClassifier as ArtXGBoostClassifier
class XGBoostModel(Model):
    """
    Wrapper class for xgboost models.

    Adds no behaviour of its own on top of `Model`; it is the base of the xgboost classifier wrapper
    defined in this module.
    """
class XGBoostClassifier(XGBoostModel):
    """
    Wrapper class for xgboost classification models.

    :param model: The original xgboost model object. Must be fit.
    :type model: Booster or XGBClassifier object
    :param output_type: The type of output the model yields (vector/label only)
    :type output_type: `ModelOutputType`
    :param input_shape: Shape of input to the model. Only the first entry is used, as the number of features.
    :type input_shape: Tuple[int, ...]
    :param nb_classes: Number of prediction classes of the model.
    :type nb_classes: int
    :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                             Set to True if the model is only available via query (API) access, i.e.,
                             only the outputs of the model are exposed, and False if the model internals
                             are also available. Default is True.
    :type black_box_access: boolean, optional
    :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
                              unlimited queries to the model API or whether there is a limit to the number of
                              queries that can be submitted. Default is True.
    :type unlimited_queries: boolean, optional
    :param kwargs: Additional keyword arguments, forwarded unchanged to the base class constructor.
    """

    def __init__(self, model: XGBClassifier, output_type: ModelOutputType, input_shape: Tuple[int, ...],
                 nb_classes: int, black_box_access: Optional[bool] = True,
                 unlimited_queries: Optional[bool] = True, **kwargs):
        """
        Initialize an `XGBoostClassifier` wrapper object. See the class docstring for the parameters.
        """
        super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs)
        # input_shape[0] is passed as the feature count of the wrapped model.
        self._art_model = ArtXGBoostClassifier(model, nb_features=input_shape[0], nb_classes=nb_classes)
        self.nb_classes = nb_classes

    def fit(self, train_data: Dataset, **kwargs) -> None:
        """
        Fit the model using the training data.

        :param train_data: Training data. Labels are expected to either be one-hot encoded or a 1D-array of
                           categorical labels (consecutive integers starting at 0).
        :type train_data: `Dataset`
        :param kwargs: Additional keyword arguments, forwarded to the `fit` method of the underlying model.
        :return: None
        """
        # `fit` is called on the model object held by the wrapper (its `_model` attribute), not on the
        # wrapper itself. Keyword arguments are forwarded rather than dropped, matching `predict`.
        self._art_model._model.fit(train_data.get_samples(), train_data.get_labels(), **kwargs)

    def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Perform predictions using the model for input `x`.

        :param x: Input samples.
        :type x: `Dataset`
        :param kwargs: Additional keyword arguments, forwarded to the `predict` method of the wrapped model.
        :return: Predictions from the model as numpy array (class probabilities, if supported).
        """
        predictions = self._art_model.predict(x.get_samples(), **kwargs)
        # Check the predictions against the output type declared at construction time.
        check_correct_model_output(predictions, self.output_type)
        return predictions

    def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.ACCURACY, **kwargs):
        """
        Score the model using test data.

        :param test_data: Test data.
        :type test_data: `Dataset`
        :param scoring_method: The method used to compute the score. Only `ScoringMethod.ACCURACY` is
                               implemented. Default is `ScoringMethod.ACCURACY`.
        :type scoring_method: `ScoringMethod`, optional
        :return: the score as float (for classifiers, between 0 and 1)
        :raises NotImplementedError: if `scoring_method` is anything other than `ScoringMethod.ACCURACY`.
        """
        y = test_data.get_labels()
        predicted = self.predict(test_data)
        # Reduce one-hot encoded predictions/labels to class indices so both sides compare element-wise.
        if is_one_hot(predicted):
            predicted = np.argmax(predicted, axis=1)
        if is_one_hot(y):
            y = np.argmax(y, axis=1)
        if scoring_method == ScoringMethod.ACCURACY:
            # Fraction of samples whose predicted label equals the true label.
            return np.count_nonzero(y == predicted) / predicted.shape[0]
        else:
            raise NotImplementedError(f"Scoring method {scoring_method} is not supported")

View file

@ -22,7 +22,7 @@ copyright = '2021, IBM'
author = 'Abigail Goldsteen'
# The full version, including alpha/beta/rc tags
release = '0.0.4'
release = '0.1.0'
master_doc = 'index'
@ -32,8 +32,11 @@ master_doc = 'index'
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.duration',
'sphinx.ext.doctest',
'sphinx.ext.autodoc',
'sphinx.ext.napoleon'
'sphinx.ext.autosummary',
'sphinx.ext.intersphinx',
]
# Add any paths that contain templates here, relative to this directory.
@ -50,7 +53,7 @@ exclude_patterns = []
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'
html_theme = 'pyramid'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,

View file

@ -8,15 +8,15 @@ apt.anonymization.anonymizer module
-----------------------------------
.. automodule:: apt.anonymization.anonymizer
:members:
:undoc-members:
:show-inheritance:
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: apt.anonymization
:members:
:undoc-members:
:show-inheritance:
:members:
:undoc-members:
:show-inheritance:

View file

@ -8,15 +8,15 @@ apt.minimization.minimizer module
---------------------------------
.. automodule:: apt.minimization.minimizer
:members:
:undoc-members:
:show-inheritance:
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: apt.minimization
:members:
:undoc-members:
:show-inheritance:
:members:
:undoc-members:
:show-inheritance:

View file

@ -5,26 +5,16 @@ Subpackages
-----------
.. toctree::
:maxdepth: 4
apt.anonymization
apt.minimization
Submodules
----------
apt.utils module
----------------
.. automodule:: apt.utils
:members:
:undoc-members:
:show-inheritance:
apt.anonymization
apt.minimization
apt.utils
Module contents
---------------
.. automodule:: apt
:members:
:undoc-members:
:show-inheritance:
:members:
:undoc-members:
:show-inheritance:

View file

@ -0,0 +1,22 @@
apt.utils.datasets package
==========================
Submodules
----------
apt.utils.datasets.datasets module
----------------------------------
.. automodule:: apt.utils.datasets.datasets
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: apt.utils.datasets
:members:
:undoc-members:
:show-inheritance:

View file

@ -0,0 +1,30 @@
apt.utils.models package
========================
Submodules
----------
apt.utils.models.model module
-----------------------------
.. automodule:: apt.utils.models.model
:members:
:undoc-members:
:show-inheritance:
apt.utils.models.sklearn\_model module
--------------------------------------
.. automodule:: apt.utils.models.sklearn_model
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: apt.utils.models
:members:
:undoc-members:
:show-inheritance:

31
docs/source/apt.utils.rst Normal file
View file

@ -0,0 +1,31 @@
apt.utils package
=================
Subpackages
-----------
.. toctree::
:maxdepth: 4
apt.utils.datasets
apt.utils.models
Submodules
----------
apt.utils.dataset\_utils module
-------------------------------
.. automodule:: apt.utils.dataset_utils
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: apt.utils
:members:
:undoc-members:
:show-inheritance:

View file

@ -1,8 +1,7 @@
ai-privacy-toolkit
==================
apt
===
.. toctree::
:maxdepth: 4
apt
tests

View file

@ -29,15 +29,198 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 121,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " parents has_nurs form children housing finance \\\n8450 pretentious very_crit foster 1 less_conv convenient \n12147 great_pret very_crit complete 1 critical inconv \n2780 usual critical complete 4 less_conv convenient \n11924 great_pret critical foster 1 critical convenient \n59 usual proper complete 2 convenient convenient \n... ... ... ... ... ... ... \n5193 pretentious less_proper complete 1 convenient inconv \n1375 usual less_proper incomplete 2 less_conv convenient \n10318 great_pret less_proper foster 4 convenient convenient \n6396 pretentious improper completed 3 less_conv convenient \n485 usual proper incomplete 1 critical inconv \n\n social health \n8450 1 not_recom \n12147 1 recommended \n2780 1 not_recom \n11924 1 not_recom \n59 0 not_recom \n... ... ... \n5193 0 recommended \n1375 1 priority \n10318 0 priority \n6396 1 recommended \n485 1 not_recom \n\n[10366 rows x 8 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>parents</th>\n <th>has_nurs</th>\n <th>form</th>\n <th>children</th>\n <th>housing</th>\n <th>finance</th>\n <th>social</th>\n <th>health</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>8450</th>\n <td>pretentious</td>\n <td>very_crit</td>\n <td>foster</td>\n <td>1</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>12147</th>\n <td>great_pret</td>\n <td>very_crit</td>\n <td>complete</td>\n <td>1</td>\n <td>critical</td>\n <td>inconv</td>\n <td>1</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>2780</th>\n <td>usual</td>\n <td>critical</td>\n <td>complete</td>\n <td>4</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>11924</th>\n <td>great_pret</td>\n <td>critical</td>\n <td>foster</td>\n <td>1</td>\n <td>critical</td>\n <td>convenient</td>\n <td>1</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>59</th>\n <td>usual</td>\n <td>proper</td>\n <td>complete</td>\n <td>2</td>\n <td>convenient</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>5193</th>\n <td>pretentious</td>\n <td>less_proper</td>\n <td>complete</td>\n <td>1</td>\n <td>convenient</td>\n <td>inconv</td>\n <td>0</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>1375</th>\n <td>usual</td>\n <td>less_proper</td>\n <td>incomplete</td>\n <td>2</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>priority</td>\n </tr>\n <tr>\n <th>10318</th>\n <td>great_pret</td>\n <td>less_proper</td>\n 
<td>foster</td>\n <td>4</td>\n <td>convenient</td>\n <td>convenient</td>\n <td>0</td>\n <td>priority</td>\n </tr>\n <tr>\n <th>6396</th>\n <td>pretentious</td>\n <td>improper</td>\n <td>completed</td>\n <td>3</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>485</th>\n <td>usual</td>\n <td>proper</td>\n <td>incomplete</td>\n <td>1</td>\n <td>critical</td>\n <td>inconv</td>\n <td>1</td>\n <td>not_recom</td>\n </tr>\n </tbody>\n</table>\n<p>10366 rows × 8 columns</p>\n</div>"
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>parents</th>\n",
" <th>has_nurs</th>\n",
" <th>form</th>\n",
" <th>children</th>\n",
" <th>housing</th>\n",
" <th>finance</th>\n",
" <th>social</th>\n",
" <th>health</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8450</th>\n",
" <td>pretentious</td>\n",
" <td>very_crit</td>\n",
" <td>foster</td>\n",
" <td>1</td>\n",
" <td>less_conv</td>\n",
" <td>convenient</td>\n",
" <td>1</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12147</th>\n",
" <td>great_pret</td>\n",
" <td>very_crit</td>\n",
" <td>complete</td>\n",
" <td>1</td>\n",
" <td>critical</td>\n",
" <td>inconv</td>\n",
" <td>1</td>\n",
" <td>recommended</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2780</th>\n",
" <td>usual</td>\n",
" <td>critical</td>\n",
" <td>complete</td>\n",
" <td>4</td>\n",
" <td>less_conv</td>\n",
" <td>convenient</td>\n",
" <td>1</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11924</th>\n",
" <td>great_pret</td>\n",
" <td>critical</td>\n",
" <td>foster</td>\n",
" <td>1</td>\n",
" <td>critical</td>\n",
" <td>convenient</td>\n",
" <td>1</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>59</th>\n",
" <td>usual</td>\n",
" <td>proper</td>\n",
" <td>complete</td>\n",
" <td>2</td>\n",
" <td>convenient</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5193</th>\n",
" <td>pretentious</td>\n",
" <td>less_proper</td>\n",
" <td>complete</td>\n",
" <td>1</td>\n",
" <td>convenient</td>\n",
" <td>inconv</td>\n",
" <td>0</td>\n",
" <td>recommended</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1375</th>\n",
" <td>usual</td>\n",
" <td>less_proper</td>\n",
" <td>incomplete</td>\n",
" <td>2</td>\n",
" <td>less_conv</td>\n",
" <td>convenient</td>\n",
" <td>1</td>\n",
" <td>priority</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10318</th>\n",
" <td>great_pret</td>\n",
" <td>less_proper</td>\n",
" <td>foster</td>\n",
" <td>4</td>\n",
" <td>convenient</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>priority</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6396</th>\n",
" <td>pretentious</td>\n",
" <td>improper</td>\n",
" <td>completed</td>\n",
" <td>3</td>\n",
" <td>less_conv</td>\n",
" <td>convenient</td>\n",
" <td>1</td>\n",
" <td>recommended</td>\n",
" </tr>\n",
" <tr>\n",
" <th>485</th>\n",
" <td>usual</td>\n",
" <td>proper</td>\n",
" <td>incomplete</td>\n",
" <td>1</td>\n",
" <td>critical</td>\n",
" <td>inconv</td>\n",
" <td>1</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10366 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" parents has_nurs form children housing finance \\\n",
"8450 pretentious very_crit foster 1 less_conv convenient \n",
"12147 great_pret very_crit complete 1 critical inconv \n",
"2780 usual critical complete 4 less_conv convenient \n",
"11924 great_pret critical foster 1 critical convenient \n",
"59 usual proper complete 2 convenient convenient \n",
"... ... ... ... ... ... ... \n",
"5193 pretentious less_proper complete 1 convenient inconv \n",
"1375 usual less_proper incomplete 2 less_conv convenient \n",
"10318 great_pret less_proper foster 4 convenient convenient \n",
"6396 pretentious improper completed 3 less_conv convenient \n",
"485 usual proper incomplete 1 critical inconv \n",
"\n",
" social health \n",
"8450 1 not_recom \n",
"12147 1 recommended \n",
"2780 1 not_recom \n",
"11924 1 not_recom \n",
"59 0 not_recom \n",
"... ... ... \n",
"5193 0 recommended \n",
"1375 1 priority \n",
"10318 0 priority \n",
"6396 1 recommended \n",
"485 1 not_recom \n",
"\n",
"[10366 rows x 8 columns]"
]
},
"execution_count": 1,
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
@ -47,9 +230,9 @@
"import sys\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
"\n",
"from apt.utils.dataset_utils import get_nursery_dataset\n",
"from apt.utils.dataset_utils import get_nursery_dataset_pd\n",
"\n",
"(x_train, y_train), (x_test, y_test) = get_nursery_dataset(transform_social=True)\n",
"(x_train, y_train), (x_test, y_test) = get_nursery_dataset_pd(transform_social=True)\n",
"\n",
"x_train"
]
@ -63,7 +246,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 122,
"metadata": {},
"outputs": [
{
@ -78,11 +261,25 @@
"from sklearn.tree import DecisionTreeClassifier\n",
"from art.estimators.classification.scikitlearn import ScikitlearnDecisionTreeClassifier\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"\n",
"x_train_str = x_train.astype(str)\n",
"train_encoded = OneHotEncoder(sparse=False).fit_transform(x_train_str)\n",
"x_test_str = x_test.astype(str)\n",
"test_encoded = OneHotEncoder(sparse=False).fit_transform(x_test_str)\n",
"numeric_features = ['social']\n",
"categorical_features = ['children', 'parents', 'has_nurs', 'form', 'housing', 'finance', 'health']\n",
"numeric_transformer = Pipeline(\n",
" steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]\n",
")\n",
"categorical_transformer = OneHotEncoder(handle_unknown=\"ignore\", sparse=False)\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" (\"num\", numeric_transformer, numeric_features),\n",
" (\"cat\", categorical_transformer, categorical_features),\n",
" ]\n",
")\n",
"\n",
"train_encoded = preprocessor.fit_transform(x_train)\n",
"test_encoded = preprocessor.transform(x_test)\n",
" \n",
"model = DecisionTreeClassifier()\n",
"model.fit(train_encoded, y_train)\n",
@ -104,14 +301,15 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 123,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from art.attacks.inference.attribute_inference import AttributeInferenceBlackBox\n",
"\n",
"attack_feature = 20\n",
"# social feature after preprocessing\n",
"attack_feature = 0\n",
"\n",
"# training data without attacked feature\n",
"x_train_for_attack = np.delete(train_encoded, attack_feature, 1)\n",
@ -140,14 +338,14 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 124,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.0\n"
"0.6000385876905268\n"
]
}
],
@ -155,7 +353,7 @@
"# get inferred values\n",
"values=[0, 1]\n",
"\n",
"inferred_train_bb = bb_attack.infer(x_train_for_attack[attack_train_size:], x_train_predictions[attack_train_size:], values=values)\n",
"inferred_train_bb = bb_attack.infer(x_train_for_attack[attack_train_size:], pred=x_train_predictions[attack_train_size:], values=values)\n",
"# check accuracy\n",
"train_acc = np.sum(inferred_train_bb == np.around(x_train_feature[attack_train_size:], decimals=8).reshape(1,-1)) / len(inferred_train_bb)\n",
"print(train_acc)"
@ -165,7 +363,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This means that for 64% of the training set, the attacked feature is inferred correctly using this attack."
"This means that for 60% of the training set, the attacked feature is inferred correctly using this attack."
]
},
{
@ -178,14 +376,14 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 125,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5122515917422342\n"
"0.6980513216284006\n"
]
}
],
@ -225,15 +423,198 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 126,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": " parents has_nurs form children housing finance \\\n0 pretentious very_crit foster 1 less_conv convenient \n1 great_pret very_crit complete 1 critical inconv \n2 usual critical complete 4 less_conv convenient \n3 great_pret critical foster 1 critical convenient \n4 usual proper complete 2 convenient convenient \n... ... ... ... ... ... ... \n10361 pretentious less_proper complete 1 convenient inconv \n10362 usual less_proper incomplete 2 less_conv convenient \n10363 great_pret less_proper foster 4 convenient convenient \n10364 pretentious improper completed 3 less_conv convenient \n10365 usual proper incomplete 1 critical convenient \n\n social health \n0 0 not_recom \n1 1 recommended \n2 0 not_recom \n3 0 not_recom \n4 0 not_recom \n... ... ... \n10361 0 recommended \n10362 1 priority \n10363 0 priority \n10364 1 recommended \n10365 0 not_recom \n\n[10366 rows x 8 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>parents</th>\n <th>has_nurs</th>\n <th>form</th>\n <th>children</th>\n <th>housing</th>\n <th>finance</th>\n <th>social</th>\n <th>health</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>pretentious</td>\n <td>very_crit</td>\n <td>foster</td>\n <td>1</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>1</th>\n <td>great_pret</td>\n <td>very_crit</td>\n <td>complete</td>\n <td>1</td>\n <td>critical</td>\n <td>inconv</td>\n <td>1</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>2</th>\n <td>usual</td>\n <td>critical</td>\n <td>complete</td>\n <td>4</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>3</th>\n <td>great_pret</td>\n <td>critical</td>\n <td>foster</td>\n <td>1</td>\n <td>critical</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>4</th>\n <td>usual</td>\n <td>proper</td>\n <td>complete</td>\n <td>2</td>\n <td>convenient</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>10361</th>\n <td>pretentious</td>\n <td>less_proper</td>\n <td>complete</td>\n <td>1</td>\n <td>convenient</td>\n <td>inconv</td>\n <td>0</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>10362</th>\n <td>usual</td>\n <td>less_proper</td>\n <td>incomplete</td>\n <td>2</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>priority</td>\n </tr>\n <tr>\n <th>10363</th>\n <td>great_pret</td>\n <td>less_proper</td>\n <td>foster</td>\n 
<td>4</td>\n <td>convenient</td>\n <td>convenient</td>\n <td>0</td>\n <td>priority</td>\n </tr>\n <tr>\n <th>10364</th>\n <td>pretentious</td>\n <td>improper</td>\n <td>completed</td>\n <td>3</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>10365</th>\n <td>usual</td>\n <td>proper</td>\n <td>incomplete</td>\n <td>1</td>\n <td>critical</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n </tbody>\n</table>\n<p>10366 rows × 8 columns</p>\n</div>"
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>parents</th>\n",
" <th>has_nurs</th>\n",
" <th>form</th>\n",
" <th>children</th>\n",
" <th>housing</th>\n",
" <th>finance</th>\n",
" <th>social</th>\n",
" <th>health</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>pretentious</td>\n",
" <td>very_crit</td>\n",
" <td>foster</td>\n",
" <td>1</td>\n",
" <td>less_conv</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>great_pret</td>\n",
" <td>very_crit</td>\n",
" <td>complete</td>\n",
" <td>1</td>\n",
" <td>critical</td>\n",
" <td>inconv</td>\n",
" <td>1</td>\n",
" <td>recommended</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>usual</td>\n",
" <td>critical</td>\n",
" <td>complete</td>\n",
" <td>4</td>\n",
" <td>less_conv</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>great_pret</td>\n",
" <td>critical</td>\n",
" <td>foster</td>\n",
" <td>1</td>\n",
" <td>critical</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>usual</td>\n",
" <td>proper</td>\n",
" <td>complete</td>\n",
" <td>2</td>\n",
" <td>convenient</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10361</th>\n",
" <td>pretentious</td>\n",
" <td>less_proper</td>\n",
" <td>complete</td>\n",
" <td>1</td>\n",
" <td>convenient</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>recommended</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10362</th>\n",
" <td>usual</td>\n",
" <td>less_proper</td>\n",
" <td>incomplete</td>\n",
" <td>2</td>\n",
" <td>less_conv</td>\n",
" <td>inconv</td>\n",
" <td>0</td>\n",
" <td>priority</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10363</th>\n",
" <td>great_pret</td>\n",
" <td>less_proper</td>\n",
" <td>foster</td>\n",
" <td>4</td>\n",
" <td>convenient</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>priority</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10364</th>\n",
" <td>pretentious</td>\n",
" <td>improper</td>\n",
" <td>completed</td>\n",
" <td>3</td>\n",
" <td>less_conv</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>recommended</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10365</th>\n",
" <td>usual</td>\n",
" <td>proper</td>\n",
" <td>incomplete</td>\n",
" <td>1</td>\n",
" <td>critical</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10366 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" parents has_nurs form children housing finance \\\n",
"0 pretentious very_crit foster 1 less_conv convenient \n",
"1 great_pret very_crit complete 1 critical inconv \n",
"2 usual critical complete 4 less_conv convenient \n",
"3 great_pret critical foster 1 critical convenient \n",
"4 usual proper complete 2 convenient convenient \n",
"... ... ... ... ... ... ... \n",
"10361 pretentious less_proper complete 1 convenient convenient \n",
"10362 usual less_proper incomplete 2 less_conv inconv \n",
"10363 great_pret less_proper foster 4 convenient convenient \n",
"10364 pretentious improper completed 3 less_conv convenient \n",
"10365 usual proper incomplete 1 critical convenient \n",
"\n",
" social health \n",
"0 0 not_recom \n",
"1 1 recommended \n",
"2 0 not_recom \n",
"3 0 not_recom \n",
"4 0 not_recom \n",
"... ... ... \n",
"10361 0 recommended \n",
"10362 0 priority \n",
"10363 0 priority \n",
"10364 0 recommended \n",
"10365 0 not_recom \n",
"\n",
"[10366 rows x 8 columns]"
]
},
"execution_count": 6,
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
@ -244,24 +625,24 @@
"\n",
"features = x_train.columns\n",
"QI = [\"finance\", \"social\", \"health\"]\n",
"categorical_features = [\"parents\", \"has_nurs\", \"form\", \"housing\", \"finance\", \"health\", 'children']\n",
"QI_indexes = [i for i, v in enumerate(features) if v in QI]\n",
"categorical_features_indexes = [i for i, v in enumerate(features) if v in categorical_features]\n",
"anonymizer = Anonymize(100, QI_indexes, categorical_features=categorical_features_indexes)\n",
"\n",
"anonymizer = Anonymize(100, QI, categorical_features=categorical_features)\n",
"anon = anonymizer.anonymize(ArrayDataset(x_train, x_train_predictions))\n",
"anon\n"
"anon"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 127,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "7585"
"text/plain": [
"7585"
]
},
"execution_count": 7,
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
@ -273,14 +654,16 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 128,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "5766"
"text/plain": [
"3001"
]
},
"execution_count": 8,
"execution_count": 128,
"metadata": {},
"output_type": "execute_result"
}
@ -299,20 +682,20 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 129,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Anonymized model accuracy: 0.9976851851851852\n"
"Anonymized model accuracy: 0.9054783950617284\n"
]
}
],
"source": [
"anon_str = anon.astype(str)\n",
"anon_encoded = OneHotEncoder(sparse=False).fit_transform(anon_str)\n",
"anon_encoded = preprocessor.fit_transform(anon)\n",
"test_encoded = preprocessor.transform(x_test)\n",
"\n",
"anon_model = DecisionTreeClassifier()\n",
"anon_model.fit(anon_encoded, y_train)\n",
@ -332,18 +715,23 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 130,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.0\n"
"0.5813235577850666\n"
]
}
],
"source": [
"# training data without attacked feature\n",
"x_train_for_attack = np.delete(train_encoded, attack_feature, 1)\n",
"# only attacked feature\n",
"x_train_feature = train_encoded[:, attack_feature].copy().reshape(-1, 1)\n",
"\n",
"anon_bb_attack = AttributeInferenceBlackBox(anon_art_classifier, attack_feature=attack_feature)\n",
"\n",
"# get original model's predictions\n",
@ -353,7 +741,7 @@
"anon_bb_attack.fit(train_encoded[:attack_train_size])\n",
"\n",
"# get inferred values\n",
"inferred_train_anon_bb = anon_bb_attack.infer(x_train_for_attack[attack_train_size:], anon_x_train_predictions[attack_train_size:], values=values)\n",
"inferred_train_anon_bb = anon_bb_attack.infer(x_train_for_attack[attack_train_size:], pred=anon_x_train_predictions[attack_train_size:], values=values)\n",
"# check accuracy\n",
"train_acc = np.sum(inferred_train_anon_bb == np.around(x_train_feature[attack_train_size:], decimals=8).reshape(1,-1)) / len(inferred_train_anon_bb)\n",
"print(train_acc)"
@ -368,14 +756,14 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 131,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5245996527107852\n"
"0.6857032606598495\n"
]
}
],
@ -399,15 +787,15 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 132,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(0.49415432579890883, 0.48976438779451525)\n",
"(0.49415432579890883, 0.48976438779451525)\n"
"(0.3353658536585366, 0.22540983606557377)\n",
"(0.3354908306364617, 0.18208430913348947)\n"
]
}
],
@ -444,15 +832,15 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 133,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1.0, 0.019204655674102813)\n",
"(0.9829787234042553, 0.04481086323957323)\n"
"(0.6457357075913777, 0.2002324905550712)\n",
"(0.6384266263237519, 0.12263876780005813)\n"
]
}
],
@ -483,24 +871,26 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 134,
"metadata": {},
"outputs": [],
"source": [
"anonymizer2 = Anonymize(1000, QI_indexes, categorical_features=categorical_features_indexes)\n",
"anonymizer2 = Anonymize(1000, QI, categorical_features=categorical_features)\n",
"anon2 = anonymizer2.anonymize(ArrayDataset(x_train, x_train_predictions))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 135,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "4226"
"text/plain": [
"1727"
]
},
"execution_count": 15,
"execution_count": 135,
"metadata": {},
"output_type": "execute_result"
}
@ -519,20 +909,20 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 136,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Anonymized model accuracy: 0.9930555555555556\n"
"Anonymized model accuracy: 0.8981481481481481\n"
]
}
],
"source": [
"anon2_str = anon2.astype(str)\n",
"anon2_encoded = OneHotEncoder(sparse=False).fit_transform(anon2_str)\n",
"anon2_encoded = preprocessor.fit_transform(anon2)\n",
"test_encoded = preprocessor.transform(x_test)\n",
"\n",
"anon2_model = DecisionTreeClassifier()\n",
"anon2_model.fit(anon2_encoded, y_train)\n",
@ -552,18 +942,23 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 137,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.0\n"
"0.546015820953116\n"
]
}
],
"source": [
"# training data without attacked feature\n",
"x_train_for_attack = np.delete(train_encoded, attack_feature, 1)\n",
"# only attacked feature\n",
"x_train_feature = train_encoded[:, attack_feature].copy().reshape(-1, 1)\n",
"\n",
"anon2_bb_attack = AttributeInferenceBlackBox(anon2_art_classifier, attack_feature=attack_feature)\n",
"\n",
"# get original model's predictions\n",
@ -573,7 +968,7 @@
"anon2_bb_attack.fit(train_encoded[:attack_train_size])\n",
"\n",
"# get inferred values\n",
"inferred_train_anon2_bb = anon2_bb_attack.infer(x_train_for_attack[attack_train_size:], anon2_x_train_predictions[attack_train_size:], values=values)\n",
"inferred_train_anon2_bb = anon2_bb_attack.infer(x_train_for_attack[attack_train_size:], pred=anon2_x_train_predictions[attack_train_size:], values=values)\n",
"# check accuracy\n",
"train_acc = np.sum(inferred_train_anon2_bb == np.around(x_train_feature[attack_train_size:], decimals=8).reshape(1,-1)) / len(inferred_train_anon2_bb)\n",
"print(train_acc)"
@ -588,14 +983,14 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 138,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.515820953115956\n"
"0.6680493922438742\n"
]
}
],
@ -612,17 +1007,17 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 139,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(0.49415432579890883, 0.48976438779451525)\n",
"(0.49415432579890883, 0.48976438779451525)\n",
"(1.0, 0.019204655674102813)\n",
"(1.0, 0.026382153249272552)\n"
"(0.3353658536585366, 0.22540983606557377)\n",
"(0.32242990654205606, 0.16159250585480095)\n",
"(0.6457357075913777, 0.2002324905550712)\n",
"(1, 0.0)\n"
]
}
],
@ -655,26 +1050,27 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 140,
"metadata": {},
"outputs": [],
"source": [
"QI2 = [\"parents\", \"has_nurs\", \"form\", \"children\", \"housing\", \"finance\", \"social\", \"health\"]\n",
"QI2_indexes = [i for i, v in enumerate(features) if v in QI2]\n",
"anonymizer3 = Anonymize(100, QI2_indexes, categorical_features=categorical_features_indexes)\n",
"anonymizer3 = Anonymize(100, QI2, categorical_features=categorical_features)\n",
"anon3 = anonymizer3.anonymize(ArrayDataset(x_train, x_train_predictions))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 141,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": "39"
"text/plain": [
"39"
]
},
"execution_count": 21,
"execution_count": 141,
"metadata": {},
"output_type": "execute_result"
}
@ -686,22 +1082,22 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 142,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Anonymized model accuracy: 0.751929012345679\n",
"BB attack accuracy: 1.0\n",
"WB attack accuracy: 0.5187150299054601\n"
"Anonymized model accuracy: 0.7600308641975309\n",
"BB attack accuracy: 0.5330889446266641\n",
"WB attack accuracy: 0.6680493922438742\n"
]
}
],
"source": [
"anon3_str = anon3.astype(str)\n",
"anon3_encoded = OneHotEncoder(sparse=False).fit_transform(anon3_str)\n",
"anon3_encoded = preprocessor.fit_transform(anon3)\n",
"test_encoded = preprocessor.transform(x_test)\n",
"\n",
"anon3_model = DecisionTreeClassifier()\n",
"anon3_model.fit(anon3_encoded, y_train)\n",
@ -710,6 +1106,11 @@
"\n",
"print('Anonymized model accuracy: ', anon3_model.score(test_encoded, y_test))\n",
"\n",
"# training data without attacked feature\n",
"x_train_for_attack = np.delete(train_encoded, attack_feature, 1)\n",
"# only attacked feature\n",
"x_train_feature = train_encoded[:, attack_feature].copy().reshape(-1, 1)\n",
"\n",
"anon3_bb_attack = AttributeInferenceBlackBox(anon3_art_classifier, attack_feature=attack_feature)\n",
"\n",
"# get original model's predictions\n",
@ -719,7 +1120,7 @@
"anon3_bb_attack.fit(train_encoded[:attack_train_size])\n",
"\n",
"# get inferred values\n",
"inferred_train_anon3_bb = anon3_bb_attack.infer(x_train_for_attack[attack_train_size:], anon3_x_train_predictions[attack_train_size:], values=values)\n",
"inferred_train_anon3_bb = anon3_bb_attack.infer(x_train_for_attack[attack_train_size:], pred=anon3_x_train_predictions[attack_train_size:], values=values)\n",
"# check accuracy\n",
"train_acc = np.sum(inferred_train_anon3_bb == np.around(x_train_feature[attack_train_size:], decimals=8).reshape(1,-1)) / len(inferred_train_anon2_bb)\n",
"print('BB attack accuracy: ', train_acc)\n",
@ -736,17 +1137,17 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 143,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(0.49415432579890883, 0.48976438779451525)\n",
"(0.49415432579890883, 0.48976438779451525)\n",
"(1.0, 0.019204655674102813)\n",
"(1.0, 0.032201745877788554)\n"
"(0.3353658536585366, 0.22540983606557377)\n",
"(0.344644750795334, 0.19028103044496486)\n",
"(0.6457357075913777, 0.2002324905550712)\n",
"(1, 0.0)\n"
]
}
],
@ -793,4 +1194,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View file

@ -27,7 +27,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 1,
"metadata": {},
"outputs": [
{
@ -42,18 +42,6 @@
" [2.2000e+01 9.0000e+00 0.0000e+00 0.0000e+00 2.0000e+01]\n",
" [5.2000e+01 9.0000e+00 1.5024e+04 0.0000e+00 4.0000e+01]]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_13726/1357868359.py:22: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" y_train = y_train.astype(np.int)\n",
"/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_13726/1357868359.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" y_test = y_test.astype(np.int)\n"
]
}
],
"source": [
@ -96,24 +84,28 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Base model accuracy: 0.8183158282660771\n"
"Base model accuracy: 0.8190528837295007\n"
]
}
],
"source": [
"import os\n",
"import sys\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
"\n",
"from apt.utils.datasets import ArrayDataset\n",
"from apt.utils.models import SklearnClassifier, ModelOutputType\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"base_est = DecisionTreeClassifier()\n",
"model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)\n",
"model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)\n",
"model.fit(ArrayDataset(x_train, y_train))\n",
"\n",
"print('Base model accuracy: ', model.score(ArrayDataset(x_test, y_test)))"
@ -129,34 +121,30 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.936540\n",
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.920665\n",
"Improving accuracy\n",
"feature to remove: 2\n",
"Removed feature: 2, new relative accuracy: 0.935261\n",
"feature to remove: 4\n",
"Removed feature: 4, new relative accuracy: 0.946776\n",
"feature to remove: 0\n",
"Removed feature: 0, new relative accuracy: 0.972876\n",
"feature to remove: 1\n",
"Removed feature: 1, new relative accuracy: 0.992835\n",
"Removed feature: 1, new relative accuracy: 0.920026\n",
"feature to remove: 0\n",
"Removed feature: 0, new relative accuracy: 0.938580\n",
"feature to remove: 4\n",
"Removed feature: 4, new relative accuracy: 0.987204\n",
"feature to remove: 2\n",
"Removed feature: 2, new relative accuracy: 0.992962\n",
"feature to remove: 3\n",
"Removed feature: 3, new relative accuracy: 1.000000\n",
"Accuracy on minimized data: 0.8231229847996315\n"
"Accuracy on minimized data: 0.8165771297006907\n"
]
}
],
"source": [
"import sys\n",
"import os\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
"\n",
"from apt.minimization import GeneralizeToRepresentative\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
@ -169,7 +157,7 @@
"# Don't forget to leave a hold-out set for final validation!\n",
"X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(x_test, y_test, stratify=y_test,\n",
" test_size = 0.4, random_state = 38)\n",
"x_train_predictions = model.predict(X_generalizer_train)\n",
"x_train_predictions = model.predict(ArrayDataset(X_generalizer_train))\n",
"if x_train_predictions.shape[1] > 1:\n",
" x_train_predictions = np.argmax(x_train_predictions, axis=1)\n",
"minimizer.fit(dataset=ArrayDataset(X_generalizer_train, x_train_predictions))\n",
@ -187,14 +175,14 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'ranges': {}, 'categories': {}, 'untouched': ['4', '1', '3', '0', '2']}\n"
"{'ranges': {}, 'categories': {}, 'untouched': ['2', '4', '3', '1', '0']}\n"
]
}
],
@ -214,25 +202,25 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.936540\n",
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.920665\n",
"Improving accuracy\n",
"feature to remove: 2\n",
"Removed feature: 2, new relative accuracy: 0.935261\n",
"feature to remove: 4\n",
"Removed feature: 4, new relative accuracy: 0.946776\n",
"feature to remove: 0\n",
"Removed feature: 0, new relative accuracy: 0.972876\n",
"feature to remove: 1\n",
"Removed feature: 1, new relative accuracy: 0.992835\n",
"Accuracy on minimized data: 0.8192845079072624\n",
"{'ranges': {'3': [569.0, 782.0, 870.0, 870.5, 938.0, 1016.5, 1311.5, 1457.0, 1494.5, 1596.0, 1629.5, 1684.0, 1805.0, 1859.0, 1867.5, 1881.5, 1938.0, 1978.5, 2119.0, 2210.0, 2218.0, 2244.5, 2298.5, 2443.5]}, 'categories': {}, 'untouched': ['2', '1', '0', '4']}\n"
"Removed feature: 1, new relative accuracy: 0.920026\n",
"feature to remove: 0\n",
"Removed feature: 0, new relative accuracy: 0.938580\n",
"feature to remove: 4\n",
"Removed feature: 4, new relative accuracy: 0.987204\n",
"feature to remove: 2\n",
"Removed feature: 2, new relative accuracy: 0.992962\n",
"Accuracy on minimized data: 0.8100537221795856\n",
"{'ranges': {'3': [704.0, 782.0, 870.0, 951.0, 1588.0, 1647.5, 1684.0, 1805.0, 1923.0, 2168.5]}, 'categories': {}, 'untouched': ['2', '4', '1', '0']}\n"
]
}
],
@ -276,4 +264,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View file

@ -14,31 +14,33 @@
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this tutorial we will show how to perform data minimization for regression ML models using the minimization module.\n",
"\n",
"We will show how to apply data minimization to different trained regression models."
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "markdown",
"source": [
"## Load data\n",
"QI parameter determines which features will be minimized."
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
"source": [
"## Load data\n",
"QI parameter determines which features will be minimized."
]
},
{
"cell_type": "code",
"execution_count": 54,
"execution_count": 7,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"from sklearn.datasets import load_diabetes\n",
@ -49,27 +51,24 @@
"\n",
"features = ['age', 'sex', 'bmi', 'bp',\n",
" 's1', 's2', 's3', 's4', 's5', 's6']\n",
"QI = [0, 2, 5, 8, 9]"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
"QI = ['age', 'bmi', 's2', 's5', 's6']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train DecisionTreeRegressor model"
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
"execution_count": 55,
"execution_count": 8,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@ -86,27 +85,24 @@
"model1 = DecisionTreeRegressor(random_state=10, min_samples_split=2)\n",
"model1.fit(X_train, y_train)\n",
"print('Base model accuracy (R2 score): ', model1.score(X_test, y_test))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run minimization\n",
"We will try to run minimization with only a subset of the features."
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
"execution_count": 56,
"execution_count": 9,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@ -121,14 +117,14 @@
"feature to remove: bmi\n",
"Removed feature: bmi, new relative accuracy: 0.718978\n",
"Accuracy on minimized data: 0.11604533946025941\n",
"generalizations: {'ranges': {'age': [-0.07090024650096893, -0.043656209483742714, -0.041839939542114735, -0.03639113181270659, -0.01459590089507401, -0.012779632292222232, -0.009147093165665865, -0.0036982858437113464, 0.03989217430353165, 0.039892176166176796, 0.05623859912157059, 0.06713621318340302], 's2': [-0.0550188384950161, -0.0285577941685915, -0.024643437936902046, -0.02135537937283516, -0.013683241792023182, -0.006480826530605555, 0.009176596067845821, 0.023111702874302864, 0.02420772146433592, 0.02655633445829153, 0.039082273840904236]}, 'categories': {}, 'untouched': ['s3', 'bmi', 's6', 'bp', 's4', 's5', 'sex', 's1']}\n"
"generalizations: {'ranges': {'age': [-0.07090024650096893, -0.043656209483742714, -0.041839939542114735, -0.03639113181270659, -0.01459590089507401, -0.012779632292222232, -0.009147093165665865, -0.0036982858437113464, 0.03989217430353165, 0.039892176166176796, 0.05623859912157059, 0.06713621318340302], 's2': [-0.0550188384950161, -0.0285577941685915, -0.024643437936902046, -0.02135537937283516, -0.013683241792023182, -0.006480826530605555, 0.009176596067845821, 0.023111702874302864, 0.02420772146433592, 0.02655633445829153, 0.039082273840904236]}, 'categories': {}, 'untouched': ['s3', 's6', 's5', 'bp', 'bmi', 's4', 's1', 'sex']}\n"
]
}
],
"source": [
"# note that is_regression param is True\n",
"\n",
"minimizer1 = GeneralizeToRepresentative(model1, target_accuracy=0.7, features=features, is_regression=True,\n",
"minimizer1 = GeneralizeToRepresentative(model1, target_accuracy=0.7, is_regression=True,\n",
" features_to_minimize=QI)\n",
"\n",
"# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
@ -139,32 +135,40 @@
" test_size = 0.4, random_state = 38)\n",
"\n",
"x_train_predictions1 = model1.predict(X_generalizer_train1)\n",
"minimizer1.fit(X_generalizer_train1, x_train_predictions1)\n",
"transformed1 = minimizer1.transform(x_test1)\n",
"minimizer1.fit(X_generalizer_train1, x_train_predictions1, features_names=features)\n",
"transformed1 = minimizer1.transform(x_test1, features_names=features)\n",
"print('Accuracy on minimized data: ', model1.score(transformed1, y_test1))\n",
"print('generalizations: ',minimizer1.generalizations_)#%% md"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
]
},
{
"cell_type": "markdown",
"source": [
"## Train linear regression model"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
"source": [
"## Train linear regression model"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Base model accuracy (R2 score): 0.5080618258593723\n"
]
}
],
"source": [
"from sklearn.linear_model import LinearRegression\n",
"from apt.minimization import GeneralizeToRepresentative\n",
@ -172,49 +176,42 @@
"model2 = LinearRegression()\n",
"model2.fit(X_train, y_train)\n",
"print('Base model accuracy (R2 score): ', model2.score(X_test, y_test))"
],
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run minimization\n",
"We will try to run minimization with only a subset of the features."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Run minimization\n",
"We will try to run minimization with only a subset of the features."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 58,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.225782\n",
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.355377\n",
"Improving accuracy\n",
"feature to remove: age\n",
"Removed feature: age, new relative accuracy: 0.223565\n",
"feature to remove: s2\n",
"Removed feature: s2, new relative accuracy: 0.759788\n",
"Accuracy on minimized data: 0.4414329261774286\n",
"generalizations: {'ranges': {'bmi': [-0.0660245232284069, -0.06171327643096447, -0.048779530450701714, -0.036923596635460854, -0.022912041284143925, -0.015906263142824173, -0.009978296235203743, 0.007266696775332093, 0.022356065921485424, 0.028822937980294228, 0.04499012045562267, 0.053073709830641747, 0.10103634744882584], 's5': [-0.08940735459327698, -0.07823517918586731, -0.07310866191983223, -0.07022909820079803, -0.06740894541144371, -0.06558558344841003, -0.041897499933838844, -0.04049498960375786, -0.03781316243112087, -0.033939776942133904, -0.03263746201992035, -0.02538660168647766, -0.023219254799187183, -0.017585186287760735, -0.016525186598300934, -0.008522996446117759, 0.0015758189256303012, 0.012934560421854258, 0.014069339726120234, 0.015929921995848417, 0.01947084255516529, 0.028651678003370762, 0.03358383011072874, 0.03639278281480074, 0.041416410356760025, 0.06386702693998814], 's6': [-0.07356456853449345, -0.052854035049676895, -0.048711927607655525, -0.0383566590026021, -0.02800139266764745, -0.021788232028484344, -0.007290858076885343, -0.007290857844054699, 0.017561784014105797, 0.02377494378015399, 0.02791705122217536, 0.02998810407007113, 0.054840744473040104]}, 'categories': {}, 'untouched': ['s2', 's3', 'bp', 's4', 'age', 'sex', 's1']}\n"
"Removed feature: s2, new relative accuracy: 0.773233\n",
"Accuracy on minimized data: 0.3945625296515525\n",
"generalizations: {'ranges': {'age': [-0.06181889958679676, -0.027309785597026348, -0.012779631884768605, -0.0036982858437113464, -0.001882016658782959, 0.0035667913034558296, 0.01991321425884962, 0.021729483967646956, 0.02717829099856317, 0.04534098319709301, 0.05805486813187599], 'bmi': [-0.0660245232284069, -0.06171327643096447, -0.048779530450701714, -0.036923596635460854, -0.022912041284143925, -0.015906263142824173, -0.009978296235203743, 0.007266696775332093, 0.022356065921485424, 0.028822937980294228, 0.04499012045562267, 0.04876246117055416, 0.053073709830641747, 0.10103634744882584], 's5': [-0.08940735459327698, -0.07823517918586731, -0.07310866191983223, -0.07022909820079803, -0.06740894541144371, -0.06558558344841003, -0.041897499933838844, -0.03781316243112087, -0.033939776942133904, -0.03263746201992035, -0.02538660168647766, -0.023219254799187183, -0.017585186287760735, -0.016525186598300934, -0.008522996446117759, -0.0048803192912600935, 0.0002040128456428647, 0.0015758189256303012, 0.008132445393130183, 0.012934560421854258, 0.014069339726120234, 0.015929921995848417, 0.01947084255516529, 0.028651678003370762, 0.03358383011072874, 0.03639278281480074, 0.041416410356760025], 's6': [-0.07356456853449345, -0.052854035049676895, -0.048711927607655525, -0.044569820165634155, -0.0383566590026021, -0.021788232028484344, -0.017646125052124262, -0.013504017610102892, 0.02377494378015399, 0.06519601307809353, 0.08383549377322197]}, 'categories': {}, 'untouched': ['s3', 's2', 'bp', 's4', 's1', 'sex']}\n"
]
}
],
"source": [
"# note that is_regression param is True\n",
"\n",
"minimizer2 = GeneralizeToRepresentative(model2, target_accuracy=0.7, features=features, is_regression=True,\n",
"minimizer2 = GeneralizeToRepresentative(model2, target_accuracy=0.7, is_regression=True,\n",
" features_to_minimize=QI)\n",
"\n",
"# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
@ -225,17 +222,11 @@
" test_size = 0.4, random_state = 38)\n",
"\n",
"x_train_predictions2 = model2.predict(X_generalizer_train2)\n",
"minimizer2.fit(X_generalizer_train2, x_train_predictions2)\n",
"transformed2 = minimizer2.transform(x_test2)\n",
"minimizer2.fit(X_generalizer_train2, x_train_predictions2, features_names=features)\n",
"transformed2 = minimizer2.transform(x_test2, features_names=features)\n",
"print('Accuracy on minimized data: ', model2.score(transformed2, y_test2))\n",
"print('generalizations: ',minimizer2.generalizations_)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
]
}
],
"metadata": {
@ -247,16 +238,16 @@
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
"nbformat_minor": 1
}

View file

@ -2,37 +2,36 @@
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Applying data minimization with categorical data and only a subset of the features to a trained ML model"
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this tutorial we will show how to perform data minimization for ML models using the minimization module.\n",
"\n",
"This will be demonstrated using the German Credit dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data)."
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load data\n",
"QI parameter determines which features will be minimized."
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
@ -108,9 +107,13 @@
}
],
"source": [
"from apt.utils import get_german_credit_dataset\n",
"import os\n",
"import sys\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
"\n",
"(x_train, y_train), (x_test, y_test) = get_german_credit_dataset()\n",
"from apt.utils.dataset_utils import get_german_credit_dataset_pd\n",
"\n",
"(x_train, y_train), (x_test, y_test) = get_german_credit_dataset_pd()\n",
"features = [\"Existing_checking_account\", \"Duration_in_month\", \"Credit_history\", \"Purpose\", \"Credit_amount\",\n",
" \"Savings_account\", \"Present_employment_since\", \"Installment_rate\", \"Personal_status_sex\", \"debtors\",\n",
" \"Present_residence\", \"Property\", \"Age\", \"Other_installment_plans\", \"Housing\",\n",
@ -123,33 +126,30 @@
" \"Housing\", \"Job\"]\n",
"\n",
"print(x_train)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train decision tree model\n",
"We use OneHotEncoder to handle categorical features."
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Base model accuracy: 0.7033333333333334\n"
"Base model accuracy: 0.6933333333333334\n"
]
}
],
@ -176,50 +176,47 @@
"\n",
"encoded_test = preprocessor.transform(x_test)\n",
"print('Base model accuracy: ', model.score(encoded_test, y_test))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run minimization\n",
"We will try to run minimization with categorical features and only a subset of the features with different possible values of target accuracy (how close to the original model's accuracy we want to get, 1 being same accuracy as for original data)."
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.791667\n",
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.805556\n",
"Improving accuracy\n",
"feature to remove: Property\n",
"Removed feature: Property, new relative accuracy: 0.819444\n",
"feature to remove: Other_installment_plans\n",
"Removed feature: Other_installment_plans, new relative accuracy: 0.833333\n",
"feature to remove: Job\n",
"Removed feature: Job, new relative accuracy: 0.833333\n",
"feature to remove: Housing\n",
"Removed feature: Housing, new relative accuracy: 0.833333\n",
"feature to remove: Purpose\n",
"Removed feature: Purpose, new relative accuracy: 0.916667\n",
"feature to remove: Credit_history\n",
"Removed feature: Credit_history, new relative accuracy: 0.930556\n",
"feature to remove: debtors\n",
"Removed feature: debtors, new relative accuracy: 0.944444\n",
"Removed feature: Credit_history, new relative accuracy: 0.819444\n",
"feature to remove: Other_installment_plans\n",
"Removed feature: Other_installment_plans, new relative accuracy: 0.847222\n",
"feature to remove: Duration_in_month\n",
"Removed feature: Duration_in_month, new relative accuracy: 1.000000\n",
"Removed feature: Duration_in_month, new relative accuracy: 0.847222\n",
"feature to remove: Property\n",
"Removed feature: Property, new relative accuracy: 0.847222\n",
"feature to remove: Housing\n",
"Removed feature: Housing, new relative accuracy: 0.847222\n",
"feature to remove: Purpose\n",
"Removed feature: Purpose, new relative accuracy: 0.986111\n",
"feature to remove: debtors\n",
"Removed feature: debtors, new relative accuracy: 0.986111\n",
"feature to remove: Job\n",
"Removed feature: Job, new relative accuracy: 1.000000\n",
"Accuracy on minimized data: 0.6666666666666666\n"
]
}
@ -233,7 +230,7 @@
"from sklearn.model_selection import train_test_split\n",
"\n",
"# default target_accuracy is 0.998\n",
"minimizer = GeneralizeToRepresentative(model, features=features,\n",
"minimizer = GeneralizeToRepresentative(model, \n",
" categorical_features=categorical_features, features_to_minimize=QI)\n",
"\n",
"# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
@ -248,117 +245,103 @@
"y_test.reset_index(drop=True, inplace=True)\n",
"encoded_generalizer_train = preprocessor.transform(X_generalizer_train)\n",
"x_train_predictions = model.predict(encoded_generalizer_train)\n",
"minimizer.fit(X_generalizer_train, x_train_predictions)\n",
"transformed = minimizer.transform(x_test)\n",
"minimizer.fit(X_generalizer_train, x_train_predictions, features_names=features)\n",
"transformed = minimizer.transform(x_test, features_names=features)\n",
"\n",
"encoded_transformed = preprocessor.transform(transformed)\n",
"print('Accuracy on minimized data: ', model.score(encoded_transformed, y_test))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Let's see what features were generalized"
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'ranges': {}, 'categories': {}, 'untouched': ['Purpose', 'Present_residence', 'Credit_history', 'Telephone', 'Job', 'Housing', 'Installment_rate', 'Number_of_existing_credits', 'Foreign_worker', 'Existing_checking_account', 'Other_installment_plans', 'N_people_being_liable_provide_maintenance', 'Property', 'Savings_account', 'Present_employment_since', 'Personal_status_sex', 'Duration_in_month', 'debtors', 'Credit_amount', 'Age']}\n"
"{'ranges': {}, 'categories': {}, 'untouched': ['Foreign_worker', 'Other_installment_plans', 'Existing_checking_account', 'Purpose', 'debtors', 'Housing', 'N_people_being_liable_provide_maintenance', 'Present_employment_since', 'Installment_rate', 'Credit_history', 'Property', 'Present_residence', 'Age', 'Credit_amount', 'Duration_in_month', 'Job', 'Personal_status_sex', 'Number_of_existing_credits', 'Savings_account', 'Telephone']}\n"
]
}
],
"source": [
"generalizations = minimizer.generalizations\n",
"print(generalizations)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can see that for the default target accuracy of 0.998 of the original accuracy, no generalizations are possible (all features are left untouched, i.e., not generalized).\n",
"\n",
"Let's change to a slightly lower target accuracy."
],
"metadata": {
"collapsed": false
}
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.791667\n",
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.805556\n",
"Improving accuracy\n",
"feature to remove: Property\n",
"Removed feature: Property, new relative accuracy: 0.819444\n",
"feature to remove: Other_installment_plans\n",
"Removed feature: Other_installment_plans, new relative accuracy: 0.833333\n",
"feature to remove: Job\n",
"Removed feature: Job, new relative accuracy: 0.833333\n",
"feature to remove: Housing\n",
"Removed feature: Housing, new relative accuracy: 0.833333\n",
"feature to remove: Purpose\n",
"Removed feature: Purpose, new relative accuracy: 0.916667\n",
"feature to remove: Credit_history\n",
"Removed feature: Credit_history, new relative accuracy: 0.930556\n",
"Accuracy on minimized data: 0.6416666666666667\n",
"{'ranges': {'Duration_in_month': [7.0, 8.5, 11.0, 13.0, 14.0, 18.0, 23.0, 25.5, 34.5, 47.5]}, 'categories': {'debtors': [['A101', 'A102'], ['A103']]}, 'untouched': ['Existing_checking_account', 'Savings_account', 'Present_employment_since', 'Property', 'Housing', 'Purpose', 'Personal_status_sex', 'Present_residence', 'Credit_history', 'Telephone', 'Installment_rate', 'Other_installment_plans', 'Number_of_existing_credits', 'Credit_amount', 'N_people_being_liable_provide_maintenance', 'Foreign_worker', 'Age', 'Job']}\n"
"Removed feature: Credit_history, new relative accuracy: 0.819444\n",
"feature to remove: Other_installment_plans\n",
"Removed feature: Other_installment_plans, new relative accuracy: 0.847222\n",
"feature to remove: Duration_in_month\n",
"Removed feature: Duration_in_month, new relative accuracy: 0.847222\n",
"feature to remove: Property\n",
"Removed feature: Property, new relative accuracy: 0.847222\n",
"feature to remove: Housing\n",
"Removed feature: Housing, new relative accuracy: 0.847222\n",
"feature to remove: Purpose\n",
"Removed feature: Purpose, new relative accuracy: 0.986111\n",
"Accuracy on minimized data: 0.6666666666666666\n",
"{'ranges': {}, 'categories': {'debtors': [['A103', 'A102'], ['A101']], 'Job': [['A173', 'A174'], ['A171'], ['A172']]}, 'untouched': ['Credit_amount', 'Duration_in_month', 'Credit_history', 'Foreign_worker', 'Housing', 'Other_installment_plans', 'Property', 'N_people_being_liable_provide_maintenance', 'Present_residence', 'Personal_status_sex', 'Telephone', 'Number_of_existing_credits', 'Present_employment_since', 'Existing_checking_account', 'Savings_account', 'Age', 'Purpose', 'Installment_rate']}\n"
]
}
],
"source": [
"# We allow an 8% deviation in accuracy from the original model accuracy (target_accuracy=0.92)\n",
"minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.92, features=features,\n",
"minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.92, \n",
" categorical_features=categorical_features, features_to_minimize=QI)\n",
"\n",
"minimizer2.fit(X_generalizer_train, x_train_predictions)\n",
"transformed2 = minimizer2.transform(x_test)\n",
"minimizer2.fit(X_generalizer_train, x_train_predictions, features_names=features)\n",
"transformed2 = minimizer2.transform(x_test, features_names=features)\n",
"\n",
"encoded_transformed2 = preprocessor.transform(transformed2)\n",
"print('Accuracy on minimized data: ', model.score(encoded_transformed2, y_test))\n",
"generalizations2 = minimizer2.generalizations\n",
"print(generalizations2)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This time we were able to generalize two features (Duration_in_month and debtors)."
],
"metadata": {
"collapsed": false
}
"This time we were able to generalize two features (debtors and Job)."
]
}
],
"metadata": {
@ -370,16 +353,16 @@
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
"nbformat_minor": 1
}

View file

@ -1,13 +1,9 @@
numpy~=1.22.3
pandas~=1.1.3
scipy~=1.5.2
scikit-learn~=1.0.2
adversarial-robustness-toolkit~=1.11.0
numpy~=1.22
pandas~=1.1.0
scipy>=1.4.1
scikit-learn>=0.22.2
torch>=1.8.0
adversarial-robustness-toolbox>=1.11.0
# testing
pytest~=6.1.1
torch~=1.11.0
sklearn~=0.0
six~=1.15.0
shap~=0.40.0
pytest>=5.4.2

View file

@ -1,7 +1,7 @@
[metadata]
# replace with your username:
name = ai-privacy-toolkit
version = 0.0.4
version = 0.1.0
author = Abigail Goldsteen
author_email = abigailt@il.ibm.com
description = A toolkit for tools and techniques related to the privacy and compliance of AI models.

View file

@ -7,14 +7,14 @@ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from apt.anonymization import Anonymize
from apt.utils.dataset_utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset
from apt.utils.dataset_utils import get_iris_dataset_np, get_adult_dataset_pd, get_nursery_dataset_pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
from apt.utils.datasets import ArrayDataset
def test_anonymize_ndarray_iris():
(x_train, y_train), _ = get_iris_dataset()
(x_train, y_train), _ = get_iris_dataset_np()
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
@ -31,11 +31,11 @@ def test_anonymize_ndarray_iris():
def test_anonymize_pandas_adult():
(x_train, y_train), _ = get_adult_dataset()
(x_train, y_train), _ = get_adult_dataset_pd()
k = 100
features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation',
'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
QI = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'native-country']
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
@ -64,8 +64,9 @@ def test_anonymize_pandas_adult():
assert (anon.loc[:, QI].value_counts().min() >= k)
np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
def test_anonymize_pandas_nursery():
(x_train, y_train), _ = get_nursery_dataset()
(x_train, y_train), _ = get_nursery_dataset_pd()
x_train = x_train.astype(str)
k = 100
@ -98,7 +99,6 @@ def test_anonymize_pandas_nursery():
def test_regression():
dataset = load_diabetes()
x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)
@ -126,9 +126,9 @@ def test_errors():
with pytest.raises(ValueError):
Anonymize(2, None)
anonymizer = Anonymize(10, [0, 2])
(x_train, y_train), (x_test, y_test) = get_iris_dataset()
(x_train, y_train), (x_test, y_test) = get_iris_dataset_np()
with pytest.raises(ValueError):
anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))
(x_train, y_train), _ = get_adult_dataset()
(x_train, y_train), _ = get_adult_dataset_pd()
with pytest.raises(ValueError):
anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))

41
tests/test_datasets.py Normal file
View file

@ -0,0 +1,41 @@
import pytest
import numpy as np
from apt.utils.datasets import Data, DatasetWithPredictions
from apt.utils import dataset_utils
def test_dataset_predictions():
    """Predictions stored without features or labels survive a round trip through ``Data``."""
    (x_train, _), _ = dataset_utils.get_iris_dataset_np()
    # One 3-value prediction row per training sample. The row count is derived
    # from the data instead of a hard-coded 105, so the test does not silently
    # depend on the train/test split used by the dataset helper.
    pred = np.tile([0.23, 0.56, 0.21], (len(x_train), 1))

    dataset = DatasetWithPredictions(pred)
    data = Data(train=dataset)
    new_pred = data.get_train_set().get_predictions()

    # assert_array_equal also rejects shape mismatches, which the broadcasting
    # np.equal(...).all() comparison could let through.
    np.testing.assert_array_equal(new_pred, pred)
def test_dataset_predictions_x():
    """Predictions stored together with features survive a round trip through ``Data``."""
    (x_train, _), _ = dataset_utils.get_iris_dataset_np()
    # Build exactly one prediction row per feature row so the two stay aligned
    # whatever split size the dataset helper produces (previously a magic 105).
    pred = np.tile([0.23, 0.56, 0.21], (len(x_train), 1))

    dataset = DatasetWithPredictions(pred, x=x_train)
    data = Data(train=dataset)
    new_pred = data.get_train_set().get_predictions()

    # assert_array_equal also rejects shape mismatches, which the broadcasting
    # np.equal(...).all() comparison could let through.
    np.testing.assert_array_equal(new_pred, pred)
def test_dataset_predictions_y():
    """Predictions stored with both features and labels survive a round trip through ``Data``."""
    (x_train, y_train), _ = dataset_utils.get_iris_dataset_np()
    # Build exactly one prediction row per feature row so predictions, features
    # and labels stay aligned whatever split size the dataset helper produces
    # (previously a magic 105).
    pred = np.tile([0.23, 0.56, 0.21], (len(x_train), 1))

    dataset = DatasetWithPredictions(pred, x=x_train, y=y_train)
    data = Data(train=dataset)
    new_pred = data.get_train_set().get_predictions()

    # assert_array_equal also rejects shape mismatches, which the broadcasting
    # np.equal(...).all() comparison could let through.
    np.testing.assert_array_equal(new_pred, pred)

View file

@ -1,6 +1,8 @@
import pytest
import numpy as np
import pandas as pd
from numpy.testing import assert_almost_equal
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_boston, load_diabetes
@ -9,11 +11,15 @@ from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from apt.minimization import GeneralizeToRepresentative
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from apt.utils.dataset_utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset, get_german_credit_dataset
from apt.utils.datasets import ArrayDataset
from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegressor
from apt.utils.dataset_utils import get_iris_dataset_np, get_adult_dataset_pd, get_german_credit_dataset_pd
from apt.utils.datasets import ArrayDataset, Data
from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegressor, KerasClassifier, \
BlackboxClassifierPredictions
@pytest.fixture
@ -39,7 +45,7 @@ def test_minimizer_params(data):
y = [1, 1, 0]
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(X, y))
gen = GeneralizeToRepresentative(model, cells=cells)
@ -63,39 +69,43 @@ def test_minimizer_fit(data):
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(X, y))
predictions = model.predict(X)
ad = ArrayDataset(X)
predictions = model.predict(ad)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.5)
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
train_dataset = ArrayDataset(X, predictions, features_names=features)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(X))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {}, 'categories': {}, 'untouched': ['height', 'age']}
transformed = gen.transform(dataset=ad)
gener = gen.generalizations
expected_generalizations = {'ranges': {}, 'categories': {}, 'untouched': ['height', 'age']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
for key in expected_generalizations['ranges']:
assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expected_generalizations['categories']:
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
indexes = []
for i in range(len(features)):
if features[i] in modified_features:
indexes.append(i)
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
ncp = gen.ncp
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[indexes]) != (X[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_minimizer_fit_pandas(data):
features = ['age', 'height', 'sex', 'ola']
@ -131,37 +141,42 @@ def test_minimizer_fit_pandas(data):
encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(encoded, y))
predictions = model.predict(encoded)
predictions = model.predict(ArrayDataset(encoded))
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features)
train_dataset = ArrayDataset(X, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(X))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': []}, 'categories': {}, 'untouched': ['ola', 'height', 'sex']}
gener = gen.generalizations
expected_generalizations = {'ranges': {'age': []}, 'categories': {'sex': [['f', 'm']], 'ola': [['aa', 'bb']]},
'untouched': ['height']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
for key in expected_generalizations['ranges']:
assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expected_generalizations['categories']:
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
ncp = gen.ncp
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[modified_features]).equals(X[modified_features])) == False)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_minimizer_params_categorical(data):
# Assume three features, age, sex and height, and boolean label
@ -212,19 +227,23 @@ def test_minimizer_params_categorical(data):
encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(encoded, y))
predictions = model.predict(encoded)
predictions = model.predict(ArrayDataset(encoded))
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, cells=cells)
train_dataset = ArrayDataset(X, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(X))
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_minimizer_fit_QI(data):
features = ['age', 'height', 'weight']
@ -244,38 +263,42 @@ def test_minimizer_fit_QI(data):
QI = ['age', 'weight']
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(X, y))
predictions = model.predict(X)
ad = ArrayDataset(X)
predictions = model.predict(ad)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=QI)
train_dataset = ArrayDataset(X, predictions, features_names=features)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(X))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
transformed = gen.transform(dataset=ad)
gener = gen.generalizations
expected_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
for key in expected_generalizations['ranges']:
assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expected_generalizations['categories']:
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
indexes = []
for i in range(len(features)):
if features[i] in modified_features:
indexes.append(i)
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
ncp = gen.ncp
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[indexes]) != (X[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_minimizer_fit_pandas_QI(data):
features = ['age', 'height', 'weight', 'sex', 'ola']
@ -313,85 +336,92 @@ def test_minimizer_fit_pandas_QI(data):
encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(encoded, y))
predictions = model.predict(encoded)
predictions = model.predict(ArrayDataset(encoded))
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=QI)
train_dataset = ArrayDataset(X, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(X))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
gener = gen.generalizations
expected_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
'untouched': ['height', 'sex']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
for key in expected_generalizations['ranges']:
assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expected_generalizations['categories']:
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
# assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1)))
np.testing.assert_array_equal(transformed.drop(QI, axis=1), X.drop(QI, axis=1))
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
# assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
ncp = gen.ncp
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[modified_features]).equals(X[modified_features])) == False)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_minimize_ndarray_iris():
features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
(x_train, y_train), (x_test, y_test) = get_iris_dataset()
(x_train, y_train), (x_test, y_test) = get_iris_dataset_np()
QI = ['sepal length (cm)', 'petal length (cm)']
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(x_train, y_train))
predictions = model.predict(x_train)
predictions = model.predict(ArrayDataset(x_train))
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.3, features_to_minimize=QI)
target_accuracy = 0.3
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=QI)
# gen.fit(dataset=ArrayDataset(x_train, predictions))
transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'sepal length (cm)': [], 'petal length (cm)': [2.449999988079071]},
gener = gen.generalizations
expected_generalizations = {'ranges': {'sepal length (cm)': [], 'petal length (cm)': [2.449999988079071]},
'categories': {}, 'untouched': ['petal width (cm)', 'sepal width (cm)']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
for key in expected_generalizations['ranges']:
assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expected_generalizations['categories']:
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(x_train, [0, 2], axis=1)).all())
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
indexes = []
for i in range(len(features)):
if features[i] in modified_features:
indexes.append(i)
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
ncp = gen.ncp
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[indexes]) != (x_train[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_minimize_pandas_adult():
(x_train, y_train), (x_test, y_test) = get_adult_dataset()
(x_train, y_train), (x_test, y_test) = get_adult_dataset_pd()
x_train = x_train.head(1000)
y_train = y_train.head(1000)
@ -420,18 +450,18 @@ def test_minimize_pandas_adult():
encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(encoded, y_train))
predictions = model.predict(encoded)
predictions = model.predict(ArrayDataset(encoded))
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.7,
target_accuracy = 0.7
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=QI)
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
transformed = gen.transform(dataset=ArrayDataset(x_train))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': [], 'education-num': []}, 'categories': {
gener = gen.generalizations
expected_generalizations = {'ranges': {'age': [], 'education-num': []}, 'categories': {
'workclass': [['Self-emp-not-inc', 'Private', 'Federal-gov', 'Self-emp-inc', '?', 'Local-gov', 'State-gov']],
'marital-status': [
['Divorced', 'Married-AF-spouse', 'Married-spouse-absent', 'Widowed', 'Separated', 'Married-civ-spouse',
@ -445,28 +475,31 @@ def test_minimize_pandas_adult():
['Euro_1', 'LatinAmerica', 'BritishCommonwealth', 'SouthAmerica', 'UnitedStates', 'China', 'Euro_2',
'SE_Asia', 'Other', 'Unknown']]}, 'untouched': ['capital-loss', 'hours-per-week', 'capital-gain']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
for key in expected_generalizations['ranges']:
assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expected_generalizations['categories']:
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
# assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
np.testing.assert_array_equal(transformed.drop(QI, axis=1), x_train.drop(QI, axis=1))
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
# assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
ncp = gen.ncp
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[modified_features]).equals(x_train[modified_features])) == False)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_german_credit_pandas():
(x_train, y_train), (x_test, y_test) = get_german_credit_dataset()
(x_train, y_train), (x_test, y_test) = get_german_credit_dataset_pd()
features = ["Existing_checking_account", "Duration_in_month", "Credit_history", "Purpose", "Credit_amount",
"Savings_account", "Present_employment_since", "Installment_rate", "Personal_status_sex", "debtors",
"Present_residence", "Property", "Age", "Other_installment_plans", "Housing",
@ -493,18 +526,18 @@ def test_german_credit_pandas():
encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(encoded, y_train))
predictions = model.predict(encoded)
predictions = model.predict(ArrayDataset(encoded))
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.7,
target_accuracy = 0.7
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=QI)
gen.fit(dataset=ArrayDataset(x_train, predictions))
transformed = gen.transform(dataset=ArrayDataset(x_train))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'Duration_in_month': [31.5]},
gener = gen.generalizations
expected_generalizations = {'ranges': {'Duration_in_month': [31.5]},
'categories': {'Credit_history': [['A30', 'A32', 'A31', 'A34', 'A33']], 'Purpose': [
['A41', 'A46', 'A43', 'A40', 'A44', 'A410', 'A49', 'A45', 'A48', 'A42']],
'debtors': [['A101', 'A102', 'A103']],
@ -518,25 +551,28 @@ def test_german_credit_pandas():
'Age', 'Existing_checking_account', 'Credit_amount',
'Present_employment_since']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
for key in expected_generalizations['ranges']:
assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expected_generalizations['categories']:
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
# assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
np.testing.assert_array_equal(transformed.drop(QI, axis=1), x_train.drop(QI, axis=1))
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
# assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
ncp = gen.ncp
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[modified_features]).equals(x_train[modified_features])) == False)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_regression():
dataset = load_diabetes()
@ -545,20 +581,21 @@ def test_regression():
base_est = DecisionTreeRegressor(random_state=10, min_samples_split=2)
model = SklearnRegressor(base_est)
model.fit(ArrayDataset(x_train, y_train))
predictions = model.predict(x_train)
predictions = model.predict(ArrayDataset(x_train))
QI = ['age', 'bmi', 's2', 's5']
features = ['age', 'sex', 'bmi', 'bp',
's1', 's2', 's3', 's4', 's5', 's6']
gen = GeneralizeToRepresentative(model, target_accuracy=0.7, is_regression=True,
target_accuracy = 0.7
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, is_regression=True,
features_to_minimize=QI)
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features))
print('Base model accuracy (R2 score): ', model.score(ArrayDataset(x_test, y_test)))
model.fit(ArrayDataset(transformed, y_train))
print('Base model accuracy (R2 score) after anonymization: ', model.score(ArrayDataset(x_test, y_test)))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {
gener = gen.generalizations
expected_generalizations = {'ranges': {
'age': [-0.07816532626748085, -0.07090024650096893, -0.05637009255588055, -0.05092128552496433,
-0.04728874587453902, -0.04547247663140297, -0.04183994047343731, -0.027309784665703773,
-0.023677248042076826, -0.020044708624482155, -0.01641217083670199, -0.001882016600575298,
@ -586,27 +623,30 @@ def test_regression():
0.061315815430134535, 0.06272498145699501, 0.06460387445986271]}, 'categories': {},
'untouched': ['s5', 's3', 'bp', 's1', 'sex', 's6', 's4']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
for key in expected_generalizations['ranges']:
assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expected_generalizations['categories']:
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
assert ((np.delete(transformed, [0, 2, 5, 8], axis=1) == np.delete(x_train, [0, 2, 5, 8], axis=1)).all())
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
indexes = []
for i in range(len(features)):
if features[i] in modified_features:
indexes.append(i)
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
ncp = gen.ncp
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[indexes]) != (x_train[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_X_y(data):
features = [0, 1, 2]
@ -626,37 +666,41 @@ def test_X_y(data):
QI = [0, 2]
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(X, y))
predictions = model.predict(X)
ad = ArrayDataset(X)
predictions = model.predict(ad)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=QI)
gen.fit(X=X, y=predictions)
transformed = gen.transform(X)
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'0': [], '2': [67.5]}, 'categories': {}, 'untouched': ['1']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
gener = gen.generalizations
expected_generalizations = {'ranges': {'0': [], '2': [67.5]}, 'categories': {}, 'untouched': ['1']}
for key in expected_generalizations['ranges']:
assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expected_generalizations['categories']:
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
modified_features = [f for f in features if
str(f) in expexted_generalizations['categories'].keys() or str(f) in expexted_generalizations[
str(f) in expected_generalizations['categories'].keys() or str(f) in expected_generalizations[
'ranges'].keys()]
indexes = []
for i in range(len(features)):
if features[i] in modified_features:
indexes.append(i)
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
ncp = gen.ncp
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[indexes]) != (X[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_X_y_features_names(data):
features = ['age', 'height', 'weight']
@ -676,37 +720,41 @@ def test_X_y_features_names(data):
QI = ['age', 'weight']
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(X, y))
predictions = model.predict(X)
ad = ArrayDataset(X)
predictions = model.predict(ad)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=QI)
gen.fit(X=X, y=predictions, features_names=features)
transformed = gen.transform(X=X, features_names=features)
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
gener = gen.generalizations
expected_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
for key in expected_generalizations['ranges']:
assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expected_generalizations['categories']:
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
indexes = []
for i in range(len(features)):
if features[i] in modified_features:
indexes.append(i)
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
ncp = gen.ncp
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[indexes]) != (X[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_BaseEstimator_classification(data):
features = ['age', 'height', 'weight', 'sex', 'ola']
@ -750,33 +798,37 @@ def test_BaseEstimator_classification(data):
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=QI)
train_dataset = ArrayDataset(X, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(X))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
gener = gen.generalizations
expected_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
'untouched': ['height', 'sex']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
for key in expected_generalizations['ranges']:
assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expected_generalizations['categories']:
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
# assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1)))
np.testing.assert_array_equal(transformed.drop(QI, axis=1), X.drop(QI, axis=1))
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
# assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
ncp = gen.ncp
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[modified_features]).equals(X[modified_features])) == False)
rel_accuracy = model.score(preprocessor.transform(transformed), predictions)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_BaseEstimator_regression():
dataset = load_diabetes()
@ -789,16 +841,16 @@ def test_BaseEstimator_regression():
QI = ['age', 'bmi', 's2', 's5']
features = ['age', 'sex', 'bmi', 'bp',
's1', 's2', 's3', 's4', 's5', 's6']
gen = GeneralizeToRepresentative(model, target_accuracy=0.7, is_regression=True,
target_accuracy = 0.7
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, is_regression=True,
features_to_minimize=QI)
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features))
print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
model.fit(transformed, y_train)
print('Base model accuracy (R2 score) after minimization: ', model.score(x_test, y_test))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {
gener = gen.generalizations
expected_generalizations = {'ranges': {
'age': [-0.07816532626748085, -0.07090024650096893, -0.05637009255588055, -0.05092128552496433,
-0.04728874587453902, -0.04547247663140297, -0.04183994047343731, -0.027309784665703773,
-0.023677248042076826, -0.020044708624482155, -0.01641217083670199, -0.001882016600575298,
@ -826,23 +878,89 @@ def test_BaseEstimator_regression():
0.061315815430134535, 0.06272498145699501, 0.06460387445986271]}, 'categories': {},
'untouched': ['s5', 's3', 'bp', 's1', 'sex', 's6', 's4']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
for key in expected_generalizations['ranges']:
assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expected_generalizations['categories']:
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
assert ((np.delete(transformed, [0, 2, 5, 8], axis=1) == np.delete(x_train, [0, 2, 5, 8], axis=1)).all())
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
indexes = []
for i in range(len(features)):
if features[i] in modified_features:
indexes.append(i)
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
ncp = gen.ncp
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[indexes]) != (x_train[indexes])).any())
rel_accuracy = model.score(transformed, predictions)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_keras_model():
    """Minimize the iris test split against a small Keras classifier.

    The generalizer is fitted on the test samples labelled with the model's
    own predictions. The test then checks that columns outside the
    generalized features are unchanged, that the reported NCP is positive
    and the generalized columns really changed whenever any generalization
    was produced, and that accuracy on the transformed data stays within
    0.05 of the requested target.
    """
    (X, y), (x_test, _) = get_iris_dataset_np()

    base_est = Sequential()
    base_est.add(Input(shape=(4,)))
    base_est.add(Dense(10, activation="relu"))
    base_est.add(Dense(3, activation='softmax'))
    base_est.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

    model = KerasClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
    model.fit(ArrayDataset(X, y))

    ad = ArrayDataset(x_test)
    predictions = model.predict(ad)
    # Collapse per-class outputs into a single predicted class index.
    if predictions.shape[1] > 1:
        predictions = np.argmax(predictions, axis=1)

    target_accuracy = 0.5
    gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
    gen.fit(dataset=ArrayDataset(x_test, predictions))
    transformed = gen.transform(dataset=ad)
    gener = gen.generalizations

    features = ['0', '1', '2', '3']
    indexes = [i for i, f in enumerate(features)
               if f in gener['categories'] or f in gener['ranges']]

    # Columns that were not generalized must come back untouched.
    assert (np.delete(transformed, indexes, axis=1) == np.delete(x_test, indexes, axis=1)).all()

    ncp = gen.ncp
    if gener['ranges'] or gener['categories']:
        assert ncp > 0
        # Compare the generalized *columns* with the data that was actually
        # transformed (x_test). Comparing rows of the training array X, as
        # before, passed regardless of what the transformation did.
        assert (np.asarray(transformed)[:, indexes] != x_test[:, indexes]).any()

    rel_accuracy = model.score(ArrayDataset(transformed, predictions))
    assert (rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05
def test_untouched():
    """Check generalizations computed directly from a predefined list of cells.

    For the four cells below the expected result is range split points
    38 and 39 for 'age', no generalized categories, and 'gender' listed as
    untouched.
    """
    cells = [
        {"id": 1, "ranges": {"age": {"start": None, "end": 38}}, "label": 0,
         'categories': {'gender': ['male']}, "representative": {"age": 26, "height": 149}},
        {"id": 2, "ranges": {"age": {"start": 39, "end": None}}, "label": 1,
         'categories': {'gender': ['female']}, "representative": {"age": 58, "height": 163}},
        {"id": 3, "ranges": {"age": {"start": None, "end": 38}}, "label": 0,
         'categories': {'gender': ['male']}, "representative": {"age": 31, "height": 184}},
        {"id": 4, "ranges": {"age": {"start": 39, "end": None}}, "label": 1,
         'categories': {'gender': ['male', 'female']}, "representative": {"age": 45, "height": 176}},
    ]
    gen = GeneralizeToRepresentative(cells=cells)
    gen._calculate_generalizations()
    gener = gen.generalizations

    expected_generalizations = {'ranges': {'age': [38, 39]}, 'categories': {}, 'untouched': ['gender']}

    # Ranges are compared as sets: order of split points is irrelevant.
    for name, split_points in expected_generalizations['ranges'].items():
        assert set(split_points) == set(gener['ranges'][name])

    # Category groups are compared as sets of frozensets for the same reason.
    for name, groups in expected_generalizations['categories'].items():
        wanted = {frozenset(group) for group in groups}
        actual = {frozenset(group) for group in gener['categories'][name]}
        assert wanted == actual

    assert set(expected_generalizations['untouched']) == set(gener['untouched'])

View file

@ -1,21 +1,32 @@
import pytest
import numpy as np
from apt.utils.models import SklearnClassifier, SklearnRegressor, ModelOutputType
from apt.utils.datasets import ArrayDataset
from apt.utils.models import SklearnClassifier, SklearnRegressor, ModelOutputType, KerasClassifier, KerasRegressor, \
BlackboxClassifierPredictions, BlackboxClassifierPredictFunction, is_one_hot, get_nb_classes, XGBoostClassifier
from apt.utils.datasets import ArrayDataset, Data, DatasetWithPredictions
from apt.utils import dataset_utils
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from art.utils import check_and_transform_label_format
from art.utils import to_categorical
def test_sklearn_classifier():
(x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset()
(x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset_np()
underlying_model = RandomForestClassifier()
model = SklearnClassifier(underlying_model, ModelOutputType.CLASSIFIER_VECTOR)
model = SklearnClassifier(underlying_model, ModelOutputType.CLASSIFIER_PROBABILITIES)
train = ArrayDataset(x_train, y_train)
test = ArrayDataset(x_test, y_test)
model.fit(train)
pred = model.predict(x_test)
pred = model.predict(test)
assert(pred.shape[0] == x_test.shape[0])
score = model.score(test)
@ -23,13 +34,296 @@ def test_sklearn_classifier():
def test_sklearn_regressor():
(x_train, y_train), (x_test, y_test) = dataset_utils.get_diabetes_dataset()
(x_train, y_train), (x_test, y_test) = dataset_utils.get_diabetes_dataset_np()
underlying_model = DecisionTreeRegressor()
model = SklearnRegressor(underlying_model)
train = ArrayDataset(x_train, y_train)
test = ArrayDataset(x_test, y_test)
model.fit(train)
pred = model.predict(x_test)
pred = model.predict(test)
assert (pred.shape[0] == x_test.shape[0])
score = model.score(test)
def test_keras_classifier():
    """Fit the KerasClassifier wrapper on iris, then check prediction count and score range."""
    (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset_np()

    network = Sequential()
    for layer in (Input(shape=(4,)),
                  Dense(100, activation="relu"),
                  Dense(10, activation="relu"),
                  Dense(3, activation='softmax')):
        network.add(layer)
    network.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

    model = KerasClassifier(network, ModelOutputType.CLASSIFIER_PROBABILITIES)
    train_set = ArrayDataset(x_train, y_train)
    test_set = ArrayDataset(x_test, y_test)

    model.fit(train_set)

    # One prediction row per test sample.
    pred = model.predict(test_set)
    assert pred.shape[0] == x_test.shape[0]

    score = model.score(test_set)
    assert 0.0 <= score <= 1.0
def test_keras_regressor():
    """Fit the KerasRegressor wrapper on the diabetes data and check prediction count.

    ``score`` is only called to make sure it runs; its value is not asserted.
    """
    (x_train, y_train), (x_test, y_test) = dataset_utils.get_diabetes_dataset_np()

    network = Sequential()
    for layer in (Input(shape=(10,)),
                  Dense(100, activation="relu"),
                  Dense(10, activation="relu"),
                  Dense(1)):
        network.add(layer)
    network.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])

    model = KerasRegressor(network)
    train_set = ArrayDataset(x_train, y_train)
    test_set = ArrayDataset(x_test, y_test)

    model.fit(train_set)

    # One prediction per test sample.
    pred = model.predict(test_set)
    assert pred.shape[0] == x_test.shape[0]

    model.score(test_set)
def test_xgboost_classifier():
    """Wrap an already-fitted XGBClassifier and exercise predict, score and fit on iris."""
    (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset_np()

    # The underlying booster is trained before it is handed to the wrapper.
    booster = XGBClassifier()
    booster.fit(x_train, y_train)
    model = XGBoostClassifier(booster, ModelOutputType.CLASSIFIER_PROBABILITIES, input_shape=(4,), nb_classes=3)

    train_set = ArrayDataset(x_train, y_train)
    test_set = ArrayDataset(x_test, y_test)

    pred = model.predict(test_set)
    assert pred.shape[0] == x_test.shape[0]

    score = model.score(test_set)
    assert 0.0 <= score <= 1.0

    # Fitting through the wrapper is exercised last; only absence of errors is checked.
    model.fit(train_set)
def test_blackbox_classifier():
    """Black-box classifier built from labelled train and test ArrayDatasets (scalar labels)."""
    (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset_np()
    test_set = ArrayDataset(x_test, y_test)
    data = Data(ArrayDataset(x_train, y_train), test_set)

    model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_SCALAR)

    pred = model.predict(test_set)
    assert pred.shape[0] == x_test.shape[0]

    # Scoring the test set against its own labels is expected to be perfect.
    assert model.score(test_set) == 1.0

    assert model.model_type is None
def test_blackbox_classifier_predictions():
    """Black-box classifier built from DatasetWithPredictions given only two arrays each.

    Prediction must work, while ``score`` is expected to raise ``ValueError``.
    """
    (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset_np()
    test_set = DatasetWithPredictions(y_test, x_test)
    data = Data(DatasetWithPredictions(y_train, x_train), test_set)

    model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_SCALAR)

    pred = model.predict(test_set)
    assert pred.shape[0] == x_test.shape[0]
    assert model.model_type is None

    with pytest.raises(ValueError):
        model.score(test_set)
def test_blackbox_classifier_predictions_y():
    """Black-box classifier built from DatasetWithPredictions given three arrays each.

    With the third array supplied, both prediction and scoring are expected to
    succeed and the score is expected to be perfect.
    """
    (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset_np()
    test_set = DatasetWithPredictions(y_test, x_test, y_test)
    data = Data(DatasetWithPredictions(y_train, x_train, y_train), test_set)

    model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_SCALAR)

    pred = model.predict(test_set)
    assert pred.shape[0] == x_test.shape[0]

    assert model.score(test_set) == 1.0
    assert model.model_type is None
def test_blackbox_classifier_mismatch():
    """Scalar iris labels combined with a PROBABILITIES output type must raise ValueError."""
    (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset_np()
    data = Data(ArrayDataset(x_train, y_train), ArrayDataset(x_test, y_test))

    # Construction itself is the operation expected to fail.
    with pytest.raises(ValueError):
        BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_PROBABILITIES)
def test_blackbox_classifier_no_test():
    """Black-box classifier built from train data only."""
    (x_train, y_train), _ = dataset_utils.get_iris_dataset_np()
    train_set = ArrayDataset(x_train, y_train)

    model = BlackboxClassifierPredictions(Data(train_set), ModelOutputType.CLASSIFIER_SCALAR)

    pred = model.predict(train_set)
    assert pred.shape[0] == x_train.shape[0]
    assert model.score(train_set) == 1.0

    # The stored predictions should be the train samples and their labels
    # after the label-format transformation with three classes.
    stored_x, stored_y = model.get_predictions()
    expected_y = check_and_transform_label_format(y_train, nb_classes=3)
    assert np.array_equal(stored_x, x_train)
    assert np.array_equal(stored_y, expected_y)
def test_blackbox_classifier_no_train():
    """Black-box classifier built from test data only."""
    _, (x_test, y_test) = dataset_utils.get_iris_dataset_np()
    test_set = ArrayDataset(x_test, y_test)

    model = BlackboxClassifierPredictions(Data(test=test_set), ModelOutputType.CLASSIFIER_SCALAR)

    pred = model.predict(test_set)
    assert pred.shape[0] == x_test.shape[0]
    assert model.score(test_set) == 1.0

    # The stored predictions should be the test samples and their labels
    # after the label-format transformation with three classes.
    stored_x, stored_y = model.get_predictions()
    expected_y = check_and_transform_label_format(y_test, nb_classes=3)
    assert np.array_equal(stored_x, x_test)
    assert np.array_equal(stored_y, expected_y)
def test_blackbox_classifier_no_test_y():
    """Black-box classifier built from labelled train data and unlabelled test data.

    Predicting and scoring the train samples must succeed with a perfect
    score, while predicting the unlabelled test samples must raise.
    """
    (x_train, y_train), (x_test, _) = dataset_utils.get_iris_dataset_np()
    train = ArrayDataset(x_train, y_train)
    test = ArrayDataset(x_test)
    data = Data(train, test)
    model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_SCALAR)

    pred = model.predict(train)
    assert pred.shape[0] == x_train.shape[0]

    score = model.score(train)
    assert score == 1.0

    # No test labels were supplied, so the classifier is expected to rely on
    # the train data only and predicting the test samples should fail.
    # The previous `assert (flag, True)` asserted a non-empty tuple, which is
    # always truthy, so the expectation was never actually verified.
    with pytest.raises(Exception):
        model.predict(test)
def test_blackbox_classifier_no_train_y():
    """Black-box classifier built from unlabelled train data and labelled test data.

    Predicting and scoring the test samples must succeed with a perfect
    score, while predicting the unlabelled train samples must raise.
    """
    (x_train, _), (x_test, y_test) = dataset_utils.get_iris_dataset_np()
    train = ArrayDataset(x_train)
    test = ArrayDataset(x_test, y_test)
    data = Data(train, test)
    model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_SCALAR)

    pred = model.predict(test)
    assert pred.shape[0] == x_test.shape[0]

    score = model.score(test)
    assert score == 1.0

    # No train labels were supplied, so the classifier is expected to rely on
    # the test data only and predicting the train samples should fail.
    # The previous `assert(flag,True)` asserted a non-empty tuple, which is
    # always truthy, so the expectation was never actually verified.
    with pytest.raises(Exception):
        model.predict(train)
def test_blackbox_classifier_probabilities():
    """Black-box classifier built from constant probability vectors used as labels."""
    (x_train, _), _ = dataset_utils.get_iris_dataset_np()

    # The same probability row repeated 105 times.
    y_train = np.array([[0.23, 0.56, 0.21]] * 105)
    train_set = ArrayDataset(x_train, y_train)

    model = BlackboxClassifierPredictions(Data(train_set), ModelOutputType.CLASSIFIER_PROBABILITIES)

    pred = model.predict(train_set)
    assert pred.shape[0] == x_train.shape[0]
    # Every returned value must lie strictly between 0 and 1.
    assert (0.0 < pred).all()
    assert (pred < 1.0).all()

    assert model.score(train_set) == 1.0
def test_blackbox_classifier_predict():
    """Black-box classifier backed by a predict function returning constant probabilities."""

    def constant_probabilities(x):
        # One identical probability row per input sample.
        return np.array([[0.23, 0.56, 0.21]] * x.shape[0])

    (x_train, _), _ = dataset_utils.get_iris_dataset_np()
    y_train = np.array([[0.23, 0.56, 0.21]] * 105)
    train_set = ArrayDataset(x_train, y_train)

    model = BlackboxClassifierPredictFunction(constant_probabilities, ModelOutputType.CLASSIFIER_PROBABILITIES, (4,), 3)

    pred = model.predict(train_set)
    assert pred.shape[0] == x_train.shape[0]
    # Every returned value must lie strictly between 0 and 1.
    assert (0.0 < pred).all()
    assert (pred < 1.0).all()

    assert model.score(train_set) == 1.0
def test_blackbox_classifier_predict_scalar():
    """Black-box classifier backed by a predict function that always returns [1.0]."""

    def always_one(x):
        # One single-element row per input sample.
        return np.array([[1.0]] * x.shape[0])

    (x_train, _), _ = dataset_utils.get_iris_dataset_np()
    # Labels are the row [0, 1, 0] repeated 105 times.
    y_train = np.array([[0, 1, 0]] * 105)
    train_set = ArrayDataset(x_train, y_train)

    model = BlackboxClassifierPredictFunction(always_one, ModelOutputType.CLASSIFIER_SCALAR, (4,), 3)

    pred = model.predict(train_set)
    assert pred.shape[0] == x_train.shape[0]

    assert model.score(train_set) == 1.0
def test_is_one_hot():
    """is_one_hot must reject the raw iris labels (flat and column-shaped) and accept their categorical form."""
    (_, labels), _ = dataset_utils.get_iris_dataset_np()

    as_column = labels.reshape(-1, 1)
    as_categorical = to_categorical(labels)

    assert not is_one_hot(labels)
    assert not is_one_hot(as_column)
    assert is_one_hot(as_categorical)
def test_get_nb_classes():
    """get_nb_classes on iris labels in several layouts, including labels with gaps."""
    (_, y_train), (_, y_test) = dataset_utils.get_iris_dataset_np()

    # shape: (x,) - not 1-hot
    from_test = get_nb_classes(y_test)
    from_train = get_nb_classes(y_train)
    assert from_test == from_train
    assert from_test == 3

    # shape: (x,1) - not 1-hot
    assert get_nb_classes(y_test.reshape(-1, 1)) == 3

    # shape: (x,3) - 1-hot
    assert get_nb_classes(to_categorical(y_test)) == 3

    # gaps: 1,2,4 (0,3 missing) - label 0 is rewritten in place to 4
    y_test[y_test == 0] = 4
    assert get_nb_classes(y_test) == 5