Wrapper minimizer (#20)

* apply dataset wrapper on minimizer
* apply changes on minimization notebook
* add black_box_access and unlimited_queries params
This commit is contained in:
Ola Saadi 2022-04-18 13:14:49 +03:00 committed by GitHub Enterprise
parent 6b04fd5564
commit ac5d82aab6
6 changed files with 583 additions and 215 deletions

View file

@ -1,7 +1,7 @@
"""
This module implements all classes needed to perform data minimization
"""
from typing import Union
from typing import Union, Optional
import pandas as pd
import numpy as np
import copy
@ -16,37 +16,32 @@ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from apt.utils.datasets import ArrayDataset, Data, DATA_PANDAS_NUMPY_TYPE
from apt.utils.models import Model, SklearnRegressor, ModelOutputType, SklearnClassifier
class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
""" A transformer that generalizes data to representative points.
Learns data generalizations based on an original model's predictions
and a target accuracy. Once the generalizations are learned, can
receive one or more data records and transform them to representative
points based on the learned generalization.
An alternative way to use the transformer is to supply ``cells`` and
``features`` in init or set_params and those will be used to transform
An alternative way to use the transformer is to supply ``cells`` in
init or set_params and those will be used to transform
data to representatives. In this case, fit must still be called but
there is no need to supply it with ``X`` and ``y``, and there is no
need to supply an existing ``estimator`` to init.
In summary, either ``estimator`` and ``target_accuracy`` should be
supplied or ``cells`` and ``features`` should be supplied.
supplied or ``cells`` should be supplied.
Parameters
----------
estimator : estimator, optional
The original model for which generalization is being performed.
Should be pre-fitted.
target_accuracy : float, optional
The required accuracy when applying the base model to the
generalized data. Accuracy is measured relative to the original
accuracy of the model.
features : list of str, optional
The feature names, in the order that they appear in the data.
categorical_features: list of str, optional
The list of categorical features should only be supplied when
passing data as a pandas dataframe.
@ -63,31 +58,33 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
The required method to train data set for minimizing. Default is
to train the tree just on the features that are given as
features_to_minimize.
Attributes
----------
features_ : list of str
The feature names, in the order that they appear in the data.
cells_ : list of object
The cells used to generalize records, as learned when calling fit.
ncp_ : float
The NCP (information loss) score of the resulting generalization,
as measured on the training data.
generalizations_ : object
The generalizations that were learned (actual feature ranges).
Notes
-----
"""
def __init__(self, estimator=None, target_accuracy=0.998, features=None,
cells=None, categorical_features=None, features_to_minimize: Union[np.ndarray, list] = None
, train_only_QI=True, is_regression=False):
self.estimator = estimator
def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: float = 0.998,
cells: list = None, categorical_features: Union[np.ndarray, list] = None,
features_to_minimize: Union[np.ndarray, list] = None, train_only_QI: bool = True,
is_regression: bool = False):
if issubclass(estimator.__class__, Model):
self.estimator = estimator
else:
if is_regression:
self.estimator = SklearnRegressor(estimator)
else:
self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_VECTOR)
self.target_accuracy = target_accuracy
self.features = features
self.cells = cells
self.categorical_features = []
if categorical_features:
@ -98,13 +95,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def get_params(self, deep=True):
"""Get parameters for this estimator.
Parameters
----------
deep : boolean, optional
If True, will return the parameters for this estimator and contained
subobjects that are estimators.
Returns
-------
params : mapping of string to any
@ -113,17 +108,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
ret = {}
ret['target_accuracy'] = self.target_accuracy
if deep:
ret['features'] = copy.deepcopy(self.features)
ret['cells'] = copy.deepcopy(self.cells)
ret['estimator'] = self.estimator
else:
ret['features'] = copy.copy(self.features)
ret['cells'] = copy.copy(self.cells)
return ret
def set_params(self, **params):
"""Set the parameters of this estimator.
Returns
-------
self : object
@ -131,8 +123,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
"""
if 'target_accuracy' in params:
self.target_accuracy = params['target_accuracy']
if 'features' in params:
self.features = params['features']
if 'cells' in params:
self.cells = params['cells']
return self
@ -141,9 +131,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def generalizations(self):
return self.generalizations_
def fit_transform(self, X: Union[np.ndarray, pd.DataFrame] = None, y: Union[np.ndarray, pd.DataFrame] = None):
def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
features_names: Optional = None, dataset: Optional[ArrayDataset] = None):
"""Learns the generalizations based on training data, and applies them to the data.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
@ -151,19 +141,23 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
y : array-like, shape (n_samples,), optional
The target values. An array of int.
This should contain the predictions of the original model on ``X``.
features_names : list of str, The feature names, in the order that they appear in the data,
provided just if X and y were provided (optional).
dataset : Data wrapper containing the training input samples and the predictions of the
original model on the training data.
Either X,y OR dataset need to be provided, not both.
Returns
-------
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
The array containing the representative values to which each record in
``X`` is mapped.
"""
self.fit(X, y)
return self.transform(X)
self.fit(X, y, features_names, dataset=dataset)
return self.transform(X, features_names, dataset=dataset)
def fit(self, X: Union[np.ndarray, pd.DataFrame] = None, y: Union[np.ndarray, pd.DataFrame] = None):
def fit(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
features_names: Optional = None, dataset: ArrayDataset = None):
"""Learns the generalizations based on training data.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
@ -171,7 +165,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
y : array-like, shape (n_samples,), optional
The target values. An array of int.
This should contain the predictions of the original model on ``X``.
features_names : list of str, The feature names, in the order that they appear in the data,
provided just if X and y were provided (optional).
dataset : Data wrapper containing the training input samples and the predictions of the
original model on the training data.
Either X,y OR dataset need to be provided, not both.
Returns
-------
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
@ -180,26 +178,25 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
"""
# take into account that estimator, X, y, cells, features may be None
if X is not None:
if type(X) == np.ndarray:
self.is_numpy = True
else:
self.is_numpy = False
if X is not None and y is not None:
if self.is_numpy:
X, y = check_X_y(X, y, accept_sparse=True)
self.n_features_ = X.shape[1]
elif self.features:
self.n_features_ = len(self.features)
if dataset is not None:
raise ValueError('Either X,y OR dataset need to be provided, not both')
else:
dataset = ArrayDataset(X, y, features_names)
if dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
self.n_features_ = dataset.get_samples().shape[1]
elif dataset and dataset.features_names:
self.n_features_ = len(dataset.features_names)
else:
self.n_features_ = 0
if self.features:
self._features = self.features
if dataset and dataset.features_names:
self._features = dataset.features_names
# if features is None, use numbers instead of names
elif self.n_features_ != 0:
self._features = [i for i in range(self.n_features_)]
self._features = [str(i) for i in range(self.n_features_)]
else:
self._features = None
@ -211,27 +208,24 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# Going to fit
# (currently not dealing with option to fit with only X and y and no estimator)
if self.estimator and X is not None and y is not None:
if self.estimator and dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
x = pd.DataFrame(dataset.get_samples(), columns=self._features)
if not self.features_to_minimize:
self.features_to_minimize = self._features
self.features_to_minimize = [str(i) for i in self.features_to_minimize]
if not all(elem in self._features for elem in self.features_to_minimize):
raise ValueError('features to minimize should be a subset of features names')
x_QI = x.loc[:, self.features_to_minimize]
if self.is_numpy:
if not self.features_to_minimize:
self.features_to_minimize = [i for i in range(len(self._features))]
x_QI = X[:, self.features_to_minimize]
self.features_to_minimize = [self._features[i] for i in self.features_to_minimize]
X = pd.DataFrame(X, columns=self._features)
else:
if not self.features_to_minimize:
self.features_to_minimize = self._features
x_QI = X.loc[:, self.features_to_minimize]
x_QI = pd.DataFrame(x_QI, columns=self.features_to_minimize)
# divide dataset into train and test
used_data = X
used_data = x
if self.train_only_QI:
used_data = x_QI
if self.is_regression:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=14)
X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4, random_state=14)
else:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.4, random_state=18)
X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), stratify=dataset.get_labels(), test_size=0.4,
random_state=18)
X_train_QI = X_train.loc[:, self.features_to_minimize]
X_test_QI = X_test.loc[:, self.features_to_minimize]
@ -245,7 +239,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
for feature in self._features:
if feature not in feature_data.keys():
fd = {}
values = list(X.loc[:, feature])
values = list(x.loc[:, feature])
if feature not in self.categorical_features:
fd['min'] = min(values)
fd['max'] = max(values)
@ -258,7 +252,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
categorical_features = [f for f in self._features if f in self.categorical_features and
f in self.features_to_minimize]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
@ -287,7 +280,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
("cat", categorical_transformer, self.categorical_features),
]
)
preprocessor.fit(X)
preprocessor.fit(x)
x_prepared = preprocessor.transform(X_train)
if self.train_only_QI:
x_prepared = preprocessor_QI_features.transform(X_train_QI)
@ -299,7 +292,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.dt_ = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=1)
else:
self.dt_ = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
min_samples_leaf=1)
self.dt_.fit(x_prepared, y_train)
self._modify_categorical_features(used_data)
@ -328,7 +321,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
# check accuracy
accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
print('Initial accuracy of model on generalized data, relative to original model predictions '
'(base generalization derived from tree, before improvements): %f' % accuracy)
@ -348,7 +341,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._calculate_generalizations()
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_,
self.cells_by_id_)
accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
# if accuracy passed threshold roll back to previous iteration generalizations
if accuracy < self.target_accuracy:
self.cells_ = cells_previous_iter
@ -374,7 +367,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._calculate_generalizations()
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
# self.cells_ currently holds the chosen generalization based on target accuracy
@ -385,14 +378,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# Return the transformer
return self
def transform(self, X: Union[np.ndarray, pd.DataFrame]):
def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional = None, dataset: ArrayDataset = None):
""" Transforms data records to representative points.
Parameters
----------
X : {array-like, sparse-matrix}, shape (n_samples, n_features), If provided as a pandas dataframe,
may contain both numeric and categorical data.
The input samples.
features_names : list of str, The feature names, in the order that they appear in the data,
provided just if X was provided (optional).
dataset : Data wrapper containing the training input samples.
Either X OR dataset need to be provided, not both.
Returns
-------
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
@ -404,26 +400,30 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
msg = 'This %(name)s instance is not initialized yet. ' \
'Call fit or set_params with ' \
'appropriate arguments before using this method.'
check_is_fitted(self, ['cells', 'features'], msg=msg)
check_is_fitted(self, ['cells'], msg=msg)
if type(X) == np.ndarray:
# Input validation
X = check_array(X, accept_sparse=True)
self.is_numpy = True
X = pd.DataFrame(X, columns=self._features)
else:
self.is_numpy = False
if X is not None:
if dataset is not None:
raise ValueError('Either X OR dataset need to be provided, not both')
else:
dataset = ArrayDataset(X, features_names=features_names)
elif dataset is None:
raise ValueError('Either X OR dataset need to be provided, not both')
if dataset and dataset.features_names:
self._features = dataset.features_names
if dataset and dataset.get_samples() is not None:
x = pd.DataFrame(dataset.get_samples(), columns=self._features)
if X.shape[1] != self.n_features_ and self.n_features_ != 0:
if x.shape[1] != self.n_features_ and self.n_features_ != 0:
raise ValueError('Shape of input is different from what was seen'
'in `fit`')
if not self._features:
self._features = [i for i in range(X.shape[1])]
self._features = [i for i in range(x.shape[1])]
representatives = pd.DataFrame(columns=self._features) # only columns
generalized = pd.DataFrame(X, columns=self._features, copy=True) # original data
mapped = np.zeros(X.shape[0]) # to mark records we already mapped
generalized = pd.DataFrame(x, columns=self._features, copy=True) # original data
mapped = np.zeros(x.shape[0]) # to mark records we already mapped
# iterate over cells (leaves in decision tree)
for i in range(len(self.cells_)):
@ -442,7 +442,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
representatives = representatives.drop(feature, axis=1)
# get the indexes of all records that map to this cell
indexes = self._get_record_indexes_for_cell(X, self.cells_[i], mapped)
indexes = self._get_record_indexes_for_cell(x, self.cells_[i], mapped)
# replace the values in the representative columns with the representative
# values (leaves others untouched)
@ -453,9 +453,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
replace = representatives.loc[i].to_frame().T.reset_index(drop=True)
replace.index = indexes
generalized.loc[indexes, representatives.columns] = replace
if self.is_numpy:
return generalized.to_numpy()
return generalized
if dataset and dataset.is_pandas:
return generalized
elif isinstance(X, pd.DataFrame):
return generalized
return generalized.to_numpy()
def _get_record_indexes_for_cell(self, X, cell, mapped):
indexes = []
@ -639,7 +641,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# else: nothing to do, stay with previous cells
def _calculate_level_cell_label(self, left_cell, right_cell, new_cell):
new_cell['hist'] = [x + y for x, y in zip(left_cell['hist'], right_cell['hist'])] if not self.is_regression else []
new_cell['hist'] = [x + y for x, y in
zip(left_cell['hist'], right_cell['hist'])] if not self.is_regression else []
new_cell['label'] = int(self.dt_.classes_[np.argmax(new_cell['hist'])]) if not self.is_regression else 1
def _get_nodes_level(self, level):
@ -796,8 +799,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
cells_by_id = copy.deepcopy(self.cells_by_id_)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
accuracy_gain = self.estimator.score(self._preprocessor.transform(generalized),
labels) - current_accuracy
accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
labels)) - current_accuracy
if accuracy_gain < 0:
accuracy_gain = 0
if accuracy_gain != 0:
@ -819,8 +822,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
cells_by_id = copy.deepcopy(self.cells_by_id_)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
accuracy_gain = self.estimator.score(self._preprocessor.transform(generalized),
labels) - current_accuracy
accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
labels)) - current_accuracy
if accuracy_gain < 0:
accuracy_gain = 0

View file

@ -180,7 +180,7 @@ class ArrayDataset(Dataset):
if self.is_pandas:
if features_names and not np.array_equal(features_names, x.columns):
raise ValueError("The supplied features are not the same as in the data features")
self.features_names = x.columns
self.features_names = x.columns.to_list()
if y is not None and len(self._x) != len(self._y):
raise ValueError('Non equivalent lengths of x and y')

View file

@ -1,5 +1,5 @@
from abc import ABCMeta, abstractmethod
from typing import Any
from typing import Any, Optional
from enum import Enum, auto
from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
@ -8,7 +8,7 @@ from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
class ModelOutputType(Enum):
CLASSIFIER_VECTOR = auto() # probabilities or logits
CLASSIFIER_SCALAR = auto() # label only
REGRESSOR_SCALAR = auto() # value
REGRESSOR_SCALAR = auto() # value
class Model(metaclass=ABCMeta):
@ -16,16 +16,26 @@ class Model(metaclass=ABCMeta):
Abstract base class for ML model wrappers.
"""
def __init__(self, model: Any, output_type: ModelOutputType, **kwargs):
def __init__(self, model: Any, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
unlimited_queries: Optional[bool] = True, **kwargs):
"""
Initialize a `Model` wrapper object.
:param model: The original model object (of the underlying ML framework)
:param output_type: The type of output the model yields (vector/label only for classifiers,
value for regressors)
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Optional, Default is True.
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Optional, Default is True.
"""
self._model = model
self._output_type = output_type
self._black_box_access = black_box_access
self._unlimited_queries = unlimited_queries
@abstractmethod
def fit(self, train_data: Dataset, **kwargs) -> None:
@ -48,6 +58,16 @@ class Model(metaclass=ABCMeta):
"""
raise NotImplementedError
@abstractmethod
def score(self, test_data: Dataset, **kwargs):
"""
Score the model using test data.
:param test_data: Test data.
:type train_data: `Dataset`
"""
return NotImplementedError
@property
def model(self) -> Any:
"""
@ -65,3 +85,25 @@ class Model(metaclass=ABCMeta):
:return: The model's output type.
"""
return self._output_type
@property
def black_box_access(self) -> Any:
"""
Return True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals are also available.
:return: True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals are also available.
"""
return self._black_box_access
@property
def unlimited_queries(self) -> Any:
"""
If black_box_access is True, Return whether a user can perform unlimited queries to the model API
or whether there is a limit to the number of queries that can be submitted.
:return: If black_box_access is True, Return whether a user can perform unlimited queries to the model API
or whether there is a limit to the number of queries that can be submitted.
"""
return self._unlimited_queries

View file

@ -1,3 +1,5 @@
from typing import Optional
import numpy as np
from sklearn.preprocessing import OneHotEncoder
@ -28,13 +30,23 @@ class SklearnClassifier(SklearnModel):
"""
Wrapper class for scikitlearn classification models.
"""
def __init__(self, model: BaseEstimator, output_type: ModelOutputType, **kwargs):
def __init__(self, model: BaseEstimator, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
unlimited_queries: Optional[bool] = True, **kwargs):
"""
Initialize a `SklearnClassifier` wrapper object.
:param model: The original sklearn model object
:param model: The original sklearn model object.
:param output_type: The type of output the model yields (vector/label only for classifiers,
value for regressors)
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Optional, Default is True.
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Optional, Default is True.
"""
super().__init__(model, output_type, **kwargs)
super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs)
self._art_model = ArtSklearnClassifier(model)
def fit(self, train_data: Dataset, **kwargs) -> None:
@ -63,13 +75,21 @@ class SklearnRegressor(SklearnModel):
"""
Wrapper class for scikitlearn regression models.
"""
def __init__(self, model: BaseEstimator, **kwargs):
def __init__(self, model: BaseEstimator, black_box_access: Optional[bool] = True,
unlimited_queries: Optional[bool] = True, **kwargs):
"""
Initialize a `SklearnRegressor` wrapper object.
:param model: The original sklearn model object
:param model: The original sklearn model object.
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Optional, Default is True.
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Optional, Default is True.
"""
super().__init__(model, ModelOutputType.REGRESSOR_SCALAR, **kwargs)
super().__init__(model, ModelOutputType.REGRESSOR_SCALAR, black_box_access, unlimited_queries, **kwargs)
self._art_model = ScikitlearnRegressor(model)
def fit(self, train_data: Dataset, **kwargs) -> None:

View file

@ -27,7 +27,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 15,
"metadata": {},
"outputs": [
{
@ -42,6 +42,18 @@
" [2.2000e+01 9.0000e+00 0.0000e+00 0.0000e+00 2.0000e+01]\n",
" [5.2000e+01 9.0000e+00 1.5024e+04 0.0000e+00 4.0000e+01]]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_13726/1357868359.py:22: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" y_train = y_train.astype(np.int)\n",
"/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_13726/1357868359.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" y_test = y_test.astype(np.int)\n"
]
}
],
"source": [
@ -84,24 +96,27 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Base model accuracy: 0.8189914624408821\n"
"Base model accuracy: 0.8183158282660771\n"
]
}
],
"source": [
"from apt.utils.datasets import ArrayDataset\n",
"from apt.utils.models import SklearnClassifier, ModelOutputType\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"model = DecisionTreeClassifier()\n",
"model.fit(x_train, y_train)\n",
"base_est = DecisionTreeClassifier()\n",
"model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)\n",
"model.fit(ArrayDataset(x_train, y_train))\n",
"\n",
"print('Base model accuracy: ', model.score(x_test, y_test))"
"print('Base model accuracy: ', model.score(ArrayDataset(x_test, y_test)))"
]
},
{
@ -114,26 +129,26 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.929376\n",
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.936540\n",
"Improving accuracy\n",
"feature to remove: 0\n",
"Removed feature: 0, new relative accuracy: 0.939867\n",
"feature to remove: 4\n",
"Removed feature: 4, new relative accuracy: 0.967247\n",
"feature to remove: 2\n",
"Removed feature: 2, new relative accuracy: 0.972620\n",
"Removed feature: 2, new relative accuracy: 0.935261\n",
"feature to remove: 4\n",
"Removed feature: 4, new relative accuracy: 0.946776\n",
"feature to remove: 0\n",
"Removed feature: 0, new relative accuracy: 0.972876\n",
"feature to remove: 1\n",
"Removed feature: 1, new relative accuracy: 0.992323\n",
"Removed feature: 1, new relative accuracy: 0.992835\n",
"feature to remove: 3\n",
"Removed feature: 3, new relative accuracy: 1.000000\n",
"Accuracy on minimized data: 0.8237371411024106\n"
"Accuracy on minimized data: 0.8231229847996315\n"
]
}
],
@ -155,10 +170,12 @@
"X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(x_test, y_test, stratify=y_test,\n",
" test_size = 0.4, random_state = 38)\n",
"x_train_predictions = model.predict(X_generalizer_train)\n",
"minimizer.fit(X_generalizer_train, x_train_predictions)\n",
"transformed = minimizer.transform(x_test)\n",
"if x_train_predictions.shape[1] > 1:\n",
" x_train_predictions = np.argmax(x_train_predictions, axis=1)\n",
"minimizer.fit(dataset=ArrayDataset(X_generalizer_train, x_train_predictions))\n",
"transformed = minimizer.transform(dataset=ArrayDataset(x_test))\n",
"\n",
"print('Accuracy on minimized data: ', model.score(transformed, y_test))"
"print('Accuracy on minimized data: ', model.score(ArrayDataset(transformed, y_test)))"
]
},
{
@ -170,14 +187,14 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'ranges': {}, 'untouched': [0, 1, 2, 3, 4]}\n"
"{'ranges': {}, 'categories': {}, 'untouched': ['4', '1', '3', '0', '2']}\n"
]
}
],
@ -197,25 +214,25 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.929376\n",
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.936540\n",
"Improving accuracy\n",
"feature to remove: 0\n",
"Removed feature: 0, new relative accuracy: 0.939867\n",
"feature to remove: 4\n",
"Removed feature: 4, new relative accuracy: 0.967247\n",
"feature to remove: 2\n",
"Removed feature: 2, new relative accuracy: 0.972620\n",
"Removed feature: 2, new relative accuracy: 0.935261\n",
"feature to remove: 4\n",
"Removed feature: 4, new relative accuracy: 0.946776\n",
"feature to remove: 0\n",
"Removed feature: 0, new relative accuracy: 0.972876\n",
"feature to remove: 1\n",
"Removed feature: 1, new relative accuracy: 0.992323\n",
"Accuracy on minimized data: 0.820205742361431\n",
"{'ranges': {3: [546.0, 704.0, 705.5, 742.5, 782.0, 834.0, 870.0, 1446.5, 1538.5, 1612.5, 1699.0, 1744.0, 1801.0, 1814.0, 1846.0, 1881.5, 1978.5, 2248.0, 2298.5, 2537.5]}, 'untouched': [0, 1, 2, 4]}\n"
"Removed feature: 1, new relative accuracy: 0.992835\n",
"Accuracy on minimized data: 0.8192845079072624\n",
"{'ranges': {'3': [569.0, 782.0, 870.0, 870.5, 938.0, 1016.5, 1311.5, 1457.0, 1494.5, 1596.0, 1629.5, 1684.0, 1805.0, 1859.0, 1867.5, 1881.5, 1938.0, 1978.5, 2119.0, 2210.0, 2218.0, 2244.5, 2298.5, 2443.5]}, 'categories': {}, 'untouched': ['2', '1', '0', '4']}\n"
]
}
],
@ -223,9 +240,9 @@
"# We allow a 1% deviation in accuracy from the original model accuracy\n",
"minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.99)\n",
"\n",
"minimizer2.fit(X_generalizer_train, x_train_predictions)\n",
"transformed2 = minimizer2.transform(x_test)\n",
"print('Accuracy on minimized data: ', model.score(transformed2, y_test))\n",
"minimizer2.fit(dataset=ArrayDataset(X_generalizer_train, x_train_predictions))\n",
"transformed2 = minimizer2.transform(dataset=ArrayDataset(x_test))\n",
"print('Accuracy on minimized data: ', model.score(test_data=ArrayDataset(transformed2, y_test)))\n",
"generalizations2 = minimizer2.generalizations\n",
"print(generalizations2)"
]
@ -259,4 +276,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View file

@ -5,14 +5,15 @@ from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_boston, load_diabetes
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from apt.minimization import GeneralizeToRepresentative
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from apt.utils.dataset_utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset, get_german_credit_dataset
from apt.utils.datasets import ArrayDataset
from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegressor
@pytest.fixture
@ -38,11 +39,12 @@ def test_minimizer_params(data):
y = [1, 1, 0]
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
base_est.fit(X, y)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model.fit(ArrayDataset(X, y))
gen = GeneralizeToRepresentative(base_est, features=features, cells=cells)
gen = GeneralizeToRepresentative(model, cells=cells)
gen.fit()
transformed = gen.transform(X)
transformed = gen.transform(dataset=ArrayDataset(X, features_names=features))
def test_minimizer_fit(data):
@ -58,15 +60,20 @@ def test_minimizer_fit(data):
[69, 175],
[24, 181],
[18, 190]])
y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
base_est.fit(X, y)
predictions = base_est.predict(X)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model.fit(ArrayDataset(X, y))
predictions = model.predict(X)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5)
gen.fit(X, predictions)
transformed = gen.transform(X)
gen = GeneralizeToRepresentative(model, target_accuracy=0.5)
train_dataset = ArrayDataset(X, predictions, features_names=features)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(X))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {}, 'categories': {}, 'untouched': ['height', 'age']}
@ -103,7 +110,7 @@ def test_minimizer_fit_pandas(data):
[69, 175, 'm', 'aa'],
[24, 181, 'm', 'bb'],
[18, 190, 'm', 'bb']]
y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
X = pd.DataFrame(X, columns=features)
numeric_features = ["age", "height"]
@ -121,16 +128,22 @@ def test_minimizer_fit_pandas(data):
]
)
encoded = preprocessor.fit_transform(X)
encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
base_est.fit(encoded, y)
predictions = base_est.predict(encoded)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model.fit(ArrayDataset(encoded, y))
predictions = model.predict(encoded)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5,
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
categorical_features=categorical_features)
gen.fit(X, predictions)
transformed = gen.transform(X)
train_dataset = ArrayDataset(X, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(X))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': []}, 'categories': {}, 'untouched': ['ola', 'height', 'sex']}
@ -143,7 +156,7 @@ def test_minimizer_fit_pandas(data):
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()]
assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
@ -179,7 +192,7 @@ def test_minimizer_params_categorical(data):
[24, 181, 'm'],
[18, 190, 'm']]
y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
X = pd.DataFrame(X, columns=features)
numeric_features = ["age", "height"]
numeric_transformer = Pipeline(
@ -196,16 +209,21 @@ def test_minimizer_params_categorical(data):
]
)
encoded = preprocessor.fit_transform(X)
encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
base_est.fit(encoded, y)
predictions = base_est.predict(encoded)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model.fit(ArrayDataset(encoded, y))
predictions = model.predict(encoded)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5,
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
categorical_features=categorical_features, cells=cells)
gen.fit(X, predictions)
transformed = gen.transform(X)
train_dataset = ArrayDataset(X, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(X))
def test_minimizer_fit_QI(data):
@ -222,16 +240,20 @@ def test_minimizer_fit_QI(data):
[24, 181, 95],
[18, 190, 102]])
print(X)
y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
QI = [0, 2]
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
QI = ['age', 'weight']
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
base_est.fit(X, y)
predictions = base_est.predict(X)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model.fit(ArrayDataset(X, y))
predictions = model.predict(X)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5, features_to_minimize=QI)
gen.fit(X, predictions)
transformed = gen.transform(X)
gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
train_dataset = ArrayDataset(X, predictions, features_names=features)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(X))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
for key in expexted_generalizations['ranges']:
@ -240,7 +262,7 @@ def test_minimizer_fit_QI(data):
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert ((np.delete(transformed, QI, axis=1) == np.delete(X, QI, axis=1)).all())
assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()]
@ -269,7 +291,7 @@ def test_minimizer_fit_pandas_QI(data):
[24, 181, 49, 'm', 'bb'],
[18, 190, 69, 'm', 'bb']]
y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
y = pd.Series([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
X = pd.DataFrame(X, columns=features)
QI = ['age', 'weight', 'ola']
@ -288,16 +310,22 @@ def test_minimizer_fit_pandas_QI(data):
]
)
encoded = preprocessor.fit_transform(X)
encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
base_est.fit(encoded, y)
predictions = base_est.predict(encoded)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model.fit(ArrayDataset(encoded, y))
predictions = model.predict(encoded)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5,
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
categorical_features=categorical_features, features_to_minimize=QI)
gen.fit(X, predictions)
transformed = gen.transform(X)
train_dataset = ArrayDataset(X, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(X))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
'untouched': ['height', 'sex']}
@ -308,12 +336,13 @@ def test_minimizer_fit_pandas_QI(data):
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1)))
# assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1)))
np.testing.assert_array_equal(transformed.drop(QI, axis=1), X.drop(QI, axis=1))
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()]
assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
# assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
@ -322,16 +351,19 @@ def test_minimizer_fit_pandas_QI(data):
def test_minimize_ndarray_iris():
features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
(x_train, y_train), _ = get_iris_dataset()
QI = [0, 2]
model = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model.fit(x_train, y_train)
pred = model.predict(x_train)
(x_train, y_train), (x_test, y_test) = get_iris_dataset()
QI = ['sepal length (cm)', 'petal length (cm)']
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model.fit(ArrayDataset(x_train, y_train))
predictions = model.predict(x_train)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.3, features=features, features_to_minimize=QI)
gen.fit(x_train, pred)
transformed = gen.transform(x_train)
gen = GeneralizeToRepresentative(model, target_accuracy=0.3, features_to_minimize=QI)
# gen.fit(dataset=ArrayDataset(x_train, predictions))
transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'sepal length (cm)': [], 'petal length (cm)': [2.449999988079071]},
'categories': {}, 'untouched': ['petal width (cm)', 'sepal width (cm)']}
@ -342,7 +374,7 @@ def test_minimize_ndarray_iris():
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert ((np.delete(transformed, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(x_train, [0, 2], axis=1)).all())
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
@ -359,12 +391,13 @@ def test_minimize_ndarray_iris():
def test_minimize_pandas_adult():
(x_train, y_train), _ = get_adult_dataset()
(x_train, y_train), (x_test, y_test) = get_adult_dataset()
x_train = x_train.head(1000)
y_train = y_train.head(1000)
features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
x_train = pd.DataFrame(x_train, columns=features)
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'hours-per-week', 'native-country']
@ -384,15 +417,19 @@ def test_minimize_pandas_adult():
]
)
encoded = preprocessor.fit_transform(x_train)
encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
base_est.fit(encoded, y_train)
predictions = base_est.predict(encoded)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model.fit(ArrayDataset(encoded, y_train))
predictions = model.predict(encoded)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(base_est, target_accuracy=0.7, features=features,
gen = GeneralizeToRepresentative(model, target_accuracy=0.7,
categorical_features=categorical_features, features_to_minimize=QI)
gen.fit(x_train, predictions)
transformed = gen.transform(x_train)
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
transformed = gen.transform(dataset=ArrayDataset(x_train))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': [], 'education-num': []}, 'categories': {
'workclass': [['Self-emp-not-inc', 'Private', 'Federal-gov', 'Self-emp-inc', '?', 'Local-gov', 'State-gov']],
@ -414,12 +451,14 @@ def test_minimize_pandas_adult():
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
# assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
np.testing.assert_array_equal(transformed.drop(QI, axis=1), x_train.drop(QI, axis=1))
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()]
assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
# assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
@ -451,15 +490,19 @@ def test_german_credit_pandas():
]
)
encoded = preprocessor.fit_transform(x_train)
encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
base_est.fit(encoded, y_train)
predictions = base_est.predict(encoded)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model.fit(ArrayDataset(encoded, y_train))
predictions = model.predict(encoded)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(base_est, target_accuracy=0.7, features=features,
gen = GeneralizeToRepresentative(model, target_accuracy=0.7,
categorical_features=categorical_features, features_to_minimize=QI)
gen.fit(x_train, predictions)
transformed = gen.transform(x_train)
gen.fit(dataset=ArrayDataset(x_train, predictions))
transformed = gen.transform(dataset=ArrayDataset(x_train))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'Duration_in_month': [31.5]},
'categories': {'Credit_history': [['A30', 'A32', 'A31', 'A34', 'A33']], 'Purpose': [
@ -481,12 +524,14 @@ def test_german_credit_pandas():
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
# assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
np.testing.assert_array_equal(transformed.drop(QI, axis=1), x_train.drop(QI, axis=1))
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()]
assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
# assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
@ -497,17 +542,258 @@ def test_regression():
dataset = load_diabetes()
x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)
model = DecisionTreeRegressor(random_state=10, min_samples_split=2)
model.fit(x_train, y_train)
pred = model.predict(x_train)
QI = [0, 2, 5, 8]
base_est = DecisionTreeRegressor(random_state=10, min_samples_split=2)
model = SklearnRegressor(base_est)
model.fit(ArrayDataset(x_train, y_train))
predictions = model.predict(x_train)
QI = ['age', 'bmi', 's2', 's5']
features = ['age', 'sex', 'bmi', 'bp',
's1', 's2', 's3', 's4', 's5', 's6']
gen = GeneralizeToRepresentative(model, target_accuracy=0.7, features=features, is_regression=True,
gen = GeneralizeToRepresentative(model, target_accuracy=0.7, is_regression=True,
features_to_minimize=QI)
gen.fit(x_train, pred)
transformed = gen.transform(x_train)
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features))
print('Base model accuracy (R2 score): ', model.score(ArrayDataset(x_test, y_test)))
model.fit(ArrayDataset(transformed, y_train))
print('Base model accuracy (R2 score) after anonymization: ', model.score(ArrayDataset(x_test, y_test)))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {
'age': [-0.07816532626748085, -0.07090024650096893, -0.05637009255588055, -0.05092128552496433,
-0.04728874587453902, -0.04547247663140297, -0.04183994047343731, -0.027309784665703773,
-0.023677248042076826, -0.020044708624482155, -0.01641217083670199, -0.001882016600575298,
0.0017505218856967986, 0.0035667913616634905, 0.007199329789727926, 0.010831868276000023,
0.02354575227946043, 0.030810829252004623, 0.03262709779664874, 0.03444336913526058,
0.03625963814556599, 0.03807590529322624, 0.03807590715587139, 0.047157252207398415,
0.06168740428984165, 0.0635036751627922, 0.06895248219370842, 0.07258502021431923, 0.07621755823493004,
0.1034616008400917],
'bmi': [-0.07626373693346977, -0.060635464265942574, -0.056863121688365936, -0.05578530766069889,
-0.054168591275811195, -0.042312657460570335, -0.0374625027179718, -0.03422906715422869,
-0.033690162003040314, -0.03261234890669584, -0.02614547684788704, -0.025067666545510292,
-0.022373135201632977, -0.016984074376523495, -0.01375063881278038, -0.007822672137990594,
-0.004589236050378531, 0.008344509289599955, 0.015889193629845977, 0.016967005096375942,
0.024511689320206642, 0.0272062208969146, 0.030978563241660595, 0.032595280557870865,
0.033673093654215336, 0.04391230642795563, 0.04552902653813362, 0.05469042807817459,
0.06977979838848114, 0.07301323488354683, 0.09349166229367256],
's2': [-0.1044962927699089, -0.08649025857448578, -0.07740895450115204, -0.07114598527550697,
-0.06378699466586113, -0.05971606448292732, -0.04437179118394852, -0.0398311372846365,
-0.03137612994760275, -0.022138250060379505, -0.018067320343106985, -0.017910746857523918,
-0.017910745926201344, -0.01618842873722315, -0.007576846517622471, -0.007263698382303119,
-0.0010007291566580534, 0.0010347360512241721, 0.006514834007248282, 0.00933317095041275,
0.012464655097573996, 0.019197346206055954, 0.020919663831591606, 0.02217225730419159,
0.032036433927714825, 0.036420512944459915, 0.04080459102988243, 0.04127431474626064,
0.04268348217010498, 0.04424922354519367, 0.04424922540783882, 0.056462014093995094, 0.05928034894168377,
0.061315815430134535, 0.06272498145699501, 0.06460387445986271]}, 'categories': {},
'untouched': ['s5', 's3', 'bp', 's1', 'sex', 's6', 's4']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert ((np.delete(transformed, [0, 2, 5, 8], axis=1) == np.delete(x_train, [0, 2, 5, 8], axis=1)).all())
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()]
indexes = []
for i in range(len(features)):
if features[i] in modified_features:
indexes.append(i)
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[indexes]) != (x_train[indexes])).any())
def test_X_y(data):
features = [0, 1, 2]
X = np.array([[23, 165, 70],
[45, 158, 67],
[56, 123, 65],
[67, 154, 90],
[45, 149, 67],
[42, 166, 58],
[73, 172, 68],
[94, 168, 69],
[69, 175, 80],
[24, 181, 95],
[18, 190, 102]])
print(X)
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
QI = [0, 2]
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model.fit(ArrayDataset(X, y))
predictions = model.predict(X)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
gen.fit(X=X, y=predictions)
transformed = gen.transform(X)
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'0': [], '2': [67.5]}, 'categories': {}, 'untouched': ['1']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
modified_features = [f for f in features if
str(f) in expexted_generalizations['categories'].keys() or str(f) in expexted_generalizations[
'ranges'].keys()]
indexes = []
for i in range(len(features)):
if features[i] in modified_features:
indexes.append(i)
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[indexes]) != (X[indexes])).any())
def test_X_y_features_names(data):
features = ['age', 'height', 'weight']
X = np.array([[23, 165, 70],
[45, 158, 67],
[56, 123, 65],
[67, 154, 90],
[45, 149, 67],
[42, 166, 58],
[73, 172, 68],
[94, 168, 69],
[69, 175, 80],
[24, 181, 95],
[18, 190, 102]])
print(X)
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
QI = ['age', 'weight']
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
model.fit(ArrayDataset(X, y))
predictions = model.predict(X)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
gen.fit(X=X, y=predictions, features_names=features)
transformed = gen.transform(X=X, features_names=features)
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()]
indexes = []
for i in range(len(features)):
if features[i] in modified_features:
indexes.append(i)
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[indexes]) != (X[indexes])).any())
def test_BaseEstimator_classification(data):
features = ['age', 'height', 'weight', 'sex', 'ola']
X = [[23, 165, 65, 'f', 'aa'],
[45, 158, 76, 'f', 'aa'],
[56, 123, 78, 'f', 'bb'],
[67, 154, 87, 'm', 'aa'],
[45, 149, 45, 'f', 'bb'],
[42, 166, 76, 'm', 'bb'],
[73, 172, 85, 'm', 'bb'],
[94, 168, 92, 'f', 'aa'],
[69, 175, 95, 'm', 'aa'],
[24, 181, 49, 'm', 'bb'],
[18, 190, 69, 'm', 'bb']]
y = pd.Series([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
X = pd.DataFrame(X, columns=features)
QI = ['age', 'weight', 'ola']
numeric_features = ["age", "height", "weight"]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
categorical_features = ["sex", "ola"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
encoded = preprocessor.fit_transform(X)
encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = base_est
model.fit(encoded, y)
predictions = model.predict(encoded)
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
categorical_features=categorical_features, features_to_minimize=QI)
train_dataset = ArrayDataset(X, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(X))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
'untouched': ['height', 'sex']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
# assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1)))
np.testing.assert_array_equal(transformed.drop(QI, axis=1), X.drop(QI, axis=1))
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()]
# assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[modified_features]).equals(X[modified_features])) == False)
def test_BaseEstimator_regression():
dataset = load_diabetes()
x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)
base_est = DecisionTreeRegressor(random_state=10, min_samples_split=2)
model = base_est
model.fit(x_train, y_train)
predictions = model.predict(x_train)
QI = ['age', 'bmi', 's2', 's5']
features = ['age', 'sex', 'bmi', 'bp',
's1', 's2', 's3', 's4', 's5', 's6']
gen = GeneralizeToRepresentative(model, target_accuracy=0.7, is_regression=True,
features_to_minimize=QI)
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features))
print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
model.fit(transformed, y_train)
print('Base model accuracy (R2 score) after anonymization: ', model.score(x_test, y_test))
@ -546,7 +832,7 @@ def test_regression():
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert ((np.delete(transformed, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
assert ((np.delete(transformed, [0, 2, 5, 8], axis=1) == np.delete(x_train, [0, 2, 5, 8], axis=1)).all())
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[