diff --git a/apt/anonymization/anonymizer.py b/apt/anonymization/anonymizer.py
index 9f82c7c..02854f5 100644
--- a/apt/anonymization/anonymizer.py
+++ b/apt/anonymization/anonymizer.py
@@ -8,6 +8,7 @@ from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
+from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
from typing import Union, Optional
@@ -49,61 +50,64 @@ class Anonymize:
self.categorical_features = categorical_features
self.is_regression = is_regression
self.train_only_QI = train_only_QI
+ self.features_names = None
+ self.features = None
- def anonymize(self, x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame]) \
- -> Union[np.ndarray, pd.DataFrame]:
+ def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
"""
Method for performing model-guided anonymization.
- :param x: The training data for the model. If provided as a pandas dataframe, may contain both numeric and
- categorical data.
- :param y: The predictions of the original model on the training data.
+ :param dataset: Data wrapper containing the training data for the model and the predictions of the
+ original model on the training data.
:return: An array containing the anonymized training dataset.
"""
- if type(x) == np.ndarray:
- self.features = [i for i in range(x.shape[1])]
- return self._anonymize_ndarray(x.copy(), y)
- else: # pandas
- self.features = x.columns
- if not self.categorical_features:
- raise ValueError('When supplying a pandas dataframe, categorical_features must be defined')
- return self._anonymize_pandas(x.copy(), y)
+ if dataset.get_samples().shape[1] != 0:
+ self.features = [i for i in range(dataset.get_samples().shape[1])]
+ else:
+ raise ValueError('No data provided')
- def _anonymize_ndarray(self, x, y):
+ if dataset.features_names is not None:
+ self.features_names = dataset.features_names
+ else: # if no names provided, use numbers instead
+ self.features_names = self.features
+
+ if not set(self.quasi_identifiers).issubset(set(self.features_names)):
+            raise ValueError('Quasi identifiers should be a subset of the supplied features or indexes in range of '
+ 'the data columns')
+ if self.categorical_features and not set(self.categorical_features).issubset(set(self.features_names)):
+            raise ValueError('Categorical features should be a subset of the supplied features or indexes in range of '
+ 'the data columns')
+ self.quasi_identifiers = [i for i, v in enumerate(self.features_names) if v in self.quasi_identifiers]
+ if self.categorical_features:
+ self.categorical_features = [i for i, v in enumerate(self.features_names) if v in self.categorical_features]
+
+ transformed = self._anonymize(dataset.get_samples().copy(), dataset.get_labels())
+ if dataset.is_pandas:
+ return pd.DataFrame(transformed, columns=self.features_names)
+ else:
+ return transformed
+
+ def _anonymize(self, x, y):
if x.shape[0] != y.shape[0]:
raise ValueError("x and y should have same number of rows")
- x_anonymizer_train = x
- if self.train_only_QI:
- # build DT just on QI features
- x_anonymizer_train = x[:, self.quasi_identifiers]
if x.dtype.kind not in 'iufc':
- x_prepared = self._modify_categorical_features(x_anonymizer_train)
+ if not self.categorical_features:
+ raise ValueError('when supplying an array with non-numeric data, categorical_features must be defined')
+ x_prepared = self._modify_categorical_features(x)
else:
- x_prepared = x_anonymizer_train
- if self.is_regression:
- self.anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
- else:
- self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
- self.anonymizer.fit(x_prepared, y)
- cells_by_id = self._calculate_cells(x, x_prepared)
- return self._anonymize_data_numpy(x, x_prepared, cells_by_id)
-
- def _anonymize_pandas(self, x, y):
- if x.shape[0] != y.shape[0]:
- raise ValueError("x and y should have same number of rows")
- x_anonymizer_train = x
+ x_prepared = x
+ x_anonymizer_train = x_prepared
if self.train_only_QI:
# build DT just on QI features
- x_anonymizer_train = x.loc[:, self.quasi_identifiers]
- # need to one-hot encode before training the decision tree
- x_prepared = self._modify_categorical_features(x_anonymizer_train)
+ x_anonymizer_train = x_prepared[:, self.quasi_identifiers]
if self.is_regression:
self.anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
else:
self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
- self.anonymizer.fit(x_prepared, y)
- cells_by_id = self._calculate_cells(x, x_prepared)
- return self._anonymize_data_pandas(x, x_prepared, cells_by_id)
+
+ self.anonymizer.fit(x_anonymizer_train, y)
+ cells_by_id = self._calculate_cells(x, x_anonymizer_train)
+ return self._anonymize_data(x, x_anonymizer_train, cells_by_id)
def _calculate_cells(self, x, x_anonymizer_train):
# x is original data, x_anonymizer_train is only QIs + 1-hot encoded
@@ -130,15 +134,9 @@ class Anonymize:
# get all rows in cell
indexes = [index for index, node_id in enumerate(node_ids) if node_id == cell['id']]
# TODO: should we filter only those with majority label? (using hist)
- if type(x) == np.ndarray:
- rows = x[indexes]
- else: # pandas
- rows = x.iloc[indexes]
+ rows = x[indexes]
for feature in self.quasi_identifiers:
- if type(x) == np.ndarray:
- values = rows[:, feature]
- else: # pandas
- values = rows.loc[:, feature]
+ values = rows[:, feature]
if self.categorical_features and feature in self.categorical_features:
# find most common value
cell['representative'][feature] = Counter(values).most_common(1)[0][0]
@@ -163,7 +161,7 @@ class Anonymize:
node_ids = self._find_sample_nodes(samples)
return [cells_by_id[node_id] for node_id in node_ids]
- def _anonymize_data_numpy(self, x, x_anonymizer_train, cells_by_id):
+ def _anonymize_data(self, x, x_anonymizer_train, cells_by_id):
cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
index = 0
for row in x:
@@ -173,22 +171,12 @@ class Anonymize:
row[feature] = cell['representative'][feature]
return x
- def _anonymize_data_pandas(self, x, x_anonymizer_train, cells_by_id):
- cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
- index = 0
- for i, row in x.iterrows():
- cell = cells[index]
- index += 1
- for feature in cell['representative']:
- x.at[i, feature] = cell['representative'][feature]
- return x
-
def _modify_categorical_features(self, x):
# prepare data for DT
used_features = self.features
if self.train_only_QI:
used_features = self.quasi_identifiers
- numeric_features = [f for f in x.columns if f in used_features and f not in self.categorical_features]
+ numeric_features = [f for f in self.features if f in used_features and f not in self.categorical_features]
categorical_features = [f for f in self.categorical_features if f in used_features]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py
index d04cc03..27b6b6e 100644
--- a/apt/minimization/minimizer.py
+++ b/apt/minimization/minimizer.py
@@ -1,7 +1,7 @@
"""
This module implements all classes needed to perform data minimization
"""
-from typing import Union
+from typing import Union, Optional
import pandas as pd
import numpy as np
import copy
@@ -16,6 +16,9 @@ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split
+from apt.utils.datasets import ArrayDataset, Data, DATA_PANDAS_NUMPY_TYPE
+from apt.utils.models import Model, SklearnRegressor, ModelOutputType, SklearnClassifier
+
class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
""" A transformer that generalizes data to representative points.
@@ -24,16 +27,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
and a target accuracy. Once the generalizations are learned, can
receive one or more data records and transform them to representative
points based on the learned generalization.
-
- An alternative way to use the transformer is to supply ``cells`` and
- ``features`` in init or set_params and those will be used to transform
+ An alternative way to use the transformer is to supply ``cells`` in
+ init or set_params and those will be used to transform
data to representatives. In this case, fit must still be called but
there is no need to supply it with ``X`` and ``y``, and there is no
need to supply an existing ``estimator`` to init.
-
In summary, either ``estimator`` and ``target_accuracy`` should be
- supplied or ``cells`` and ``features`` should be supplied.
-
+ supplied or ``cells`` should be supplied.
Parameters
----------
estimator : estimator, optional
@@ -43,8 +43,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
The required accuracy when applying the base model to the
generalized data. Accuracy is measured relative to the original
accuracy of the model.
- features : list of str, optional
- The feature names, in the order that they appear in the data.
categorical_features: list of str, optional
The list of categorical features should only be supplied when
passing data as a pandas dataframe.
@@ -67,28 +65,29 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
Attributes
----------
+ features_ : list of str
+ The feature names, in the order that they appear in the data.
cells_ : list of object
The cells used to generalize records, as learned when calling fit.
-
ncp_ : float
The NCP (information loss) score of the resulting generalization,
as measured on the training data.
-
generalizations_ : object
The generalizations that were learned (actual feature ranges).
-
- Notes
- -----
-
-
"""
- def __init__(self, estimator=None, target_accuracy=0.998, features=None,
- cells=None, categorical_features=None, features_to_minimize: Union[np.ndarray, list] = None
- , train_only_QI=True, is_regression=False):
- self.estimator = estimator
+ def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: float = 0.998,
+ cells: list = None, categorical_features: Union[np.ndarray, list] = None,
+ features_to_minimize: Union[np.ndarray, list] = None, train_only_QI: bool = True,
+ is_regression: bool = False):
+ if issubclass(estimator.__class__, Model):
+ self.estimator = estimator
+ else:
+ if is_regression:
+ self.estimator = SklearnRegressor(estimator)
+ else:
+ self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_VECTOR)
self.target_accuracy = target_accuracy
- self.features = features
self.cells = cells
self.categorical_features = []
if categorical_features:
@@ -114,11 +113,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
ret = {}
ret['target_accuracy'] = self.target_accuracy
if deep:
- ret['features'] = copy.deepcopy(self.features)
ret['cells'] = copy.deepcopy(self.cells)
ret['estimator'] = self.estimator
else:
- ret['features'] = copy.copy(self.features)
ret['cells'] = copy.copy(self.cells)
return ret
@@ -132,8 +129,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
"""
if 'target_accuracy' in params:
self.target_accuracy = params['target_accuracy']
- if 'features' in params:
- self.features = params['features']
if 'cells' in params:
self.cells = params['cells']
return self
@@ -142,7 +137,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def generalizations(self):
return self.generalizations_
- def fit_transform(self, X: Union[np.ndarray, pd.DataFrame] = None, y: Union[np.ndarray, pd.DataFrame] = None):
+ def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
+ features_names: Optional = None, dataset: Optional[ArrayDataset] = None):
"""Learns the generalizations based on training data, and applies them to the data.
Parameters
@@ -152,17 +148,22 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
y : array-like, shape (n_samples,), optional
The target values. An array of int.
This should contain the predictions of the original model on ``X``.
-
+        features_names : list of str, optional. The feature names, in the order that they appear in the data.
+        Supply only if X and y are provided.
+ dataset : Data wrapper containing the training input samples and the predictions of the
+ original model on the training data.
+ Either X,y OR dataset need to be provided, not both.
Returns
-------
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
The array containing the representative values to which each record in
``X`` is mapped.
"""
- self.fit(X, y)
- return self.transform(X)
+ self.fit(X, y, features_names, dataset=dataset)
+ return self.transform(X, features_names, dataset=dataset)
- def fit(self, X: Union[np.ndarray, pd.DataFrame] = None, y: Union[np.ndarray, pd.DataFrame] = None):
+ def fit(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
+ features_names: Optional = None, dataset: ArrayDataset = None):
"""Learns the generalizations based on training data.
Parameters
@@ -172,7 +173,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
y : array-like, shape (n_samples,), optional
The target values. An array of int.
This should contain the predictions of the original model on ``X``.
-
+        features_names : list of str, optional. The feature names, in the order that they appear in the data.
+        Supply only if X and y are provided.
+ dataset : Data wrapper containing the training input samples and the predictions of the
+ original model on the training data.
+ Either X,y OR dataset need to be provided, not both.
Returns
-------
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
@@ -181,26 +186,25 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
"""
# take into account that estimator, X, y, cells, features may be None
- if X is not None:
- if type(X) == np.ndarray:
- self.is_numpy = True
- else:
- self.is_numpy = False
-
if X is not None and y is not None:
- if self.is_numpy:
- X, y = check_X_y(X, y, accept_sparse=True)
- self.n_features_ = X.shape[1]
- elif self.features:
- self.n_features_ = len(self.features)
+ if dataset is not None:
+ raise ValueError('Either X,y OR dataset need to be provided, not both')
+ else:
+ dataset = ArrayDataset(X, y, features_names)
+
+ if dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
+ self.n_features_ = dataset.get_samples().shape[1]
+
+ elif dataset and dataset.features_names:
+ self.n_features_ = len(dataset.features_names)
else:
self.n_features_ = 0
- if self.features:
- self._features = self.features
+ if dataset and dataset.features_names:
+ self._features = dataset.features_names
# if features is None, use numbers instead of names
elif self.n_features_ != 0:
- self._features = [i for i in range(self.n_features_)]
+ self._features = [str(i) for i in range(self.n_features_)]
else:
self._features = None
@@ -212,27 +216,24 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# Going to fit
# (currently not dealing with option to fit with only X and y and no estimator)
- if self.estimator and X is not None and y is not None:
+ if self.estimator and dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
+ x = pd.DataFrame(dataset.get_samples(), columns=self._features)
+ if not self.features_to_minimize:
+ self.features_to_minimize = self._features
+ self.features_to_minimize = [str(i) for i in self.features_to_minimize]
+ if not all(elem in self._features for elem in self.features_to_minimize):
+ raise ValueError('features to minimize should be a subset of features names')
+ x_QI = x.loc[:, self.features_to_minimize]
- if self.is_numpy:
- if not self.features_to_minimize:
- self.features_to_minimize = [i for i in range(len(self._features))]
- x_QI = X[:, self.features_to_minimize]
- self.features_to_minimize = [self._features[i] for i in self.features_to_minimize]
- X = pd.DataFrame(X, columns=self._features)
- else:
- if not self.features_to_minimize:
- self.features_to_minimize = self._features
- x_QI = X.loc[:, self.features_to_minimize]
- x_QI = pd.DataFrame(x_QI, columns=self.features_to_minimize)
# divide dataset into train and test
- used_data = X
+ used_data = x
if self.train_only_QI:
used_data = x_QI
if self.is_regression:
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=14)
+ X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4, random_state=14)
else:
- X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.4, random_state=18)
+ X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), stratify=dataset.get_labels(), test_size=0.4,
+ random_state=18)
X_train_QI = X_train.loc[:, self.features_to_minimize]
X_test_QI = X_test.loc[:, self.features_to_minimize]
@@ -246,7 +247,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
for feature in self._features:
if feature not in feature_data.keys():
fd = {}
- values = list(X.loc[:, feature])
+ values = list(x.loc[:, feature])
if feature not in self.categorical_features:
fd['min'] = min(values)
fd['max'] = max(values)
@@ -259,7 +260,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
categorical_features = [f for f in self._features if f in self.categorical_features and
f in self.features_to_minimize]
-
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
@@ -288,7 +288,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
("cat", categorical_transformer, self.categorical_features),
]
)
- preprocessor.fit(X)
+ preprocessor.fit(x)
x_prepared = preprocessor.transform(X_train)
if self.train_only_QI:
x_prepared = preprocessor_QI_features.transform(X_train_QI)
@@ -300,7 +300,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.dt_ = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=1)
else:
self.dt_ = DecisionTreeClassifier(random_state=0, min_samples_split=2,
- min_samples_leaf=1)
+ min_samples_leaf=1)
self.dt_.fit(x_prepared, y_train)
self._modify_categorical_features(used_data)
@@ -329,7 +329,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
# check accuracy
- accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
+ accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
print('Initial accuracy of model on generalized data, relative to original model predictions '
'(base generalization derived from tree, before improvements): %f' % accuracy)
@@ -349,7 +349,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._calculate_generalizations()
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_,
self.cells_by_id_)
- accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
+ accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
# if accuracy passed threshold roll back to previous iteration generalizations
if accuracy < self.target_accuracy:
self.cells_ = cells_previous_iter
@@ -375,7 +375,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._calculate_generalizations()
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
- accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
+ accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
# self.cells_ currently holds the chosen generalization based on target accuracy
@@ -386,7 +386,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# Return the transformer
return self
- def transform(self, X: Union[np.ndarray, pd.DataFrame]):
+ def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional = None, dataset: ArrayDataset = None):
""" Transforms data records to representative points.
Parameters
@@ -394,6 +394,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
X : {array-like, sparse-matrix}, shape (n_samples, n_features), If provided as a pandas dataframe,
may contain both numeric and categorical data.
The input samples.
+        features_names : list of str, optional. The feature names, in the order that they appear in the data.
+        Supply only if X is provided.
+ dataset : Data wrapper containing the training input samples.
+ Either X OR dataset need to be provided, not both.
Returns
-------
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
@@ -405,26 +409,30 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
msg = 'This %(name)s instance is not initialized yet. ' \
'Call ‘fit’ or ‘set_params’ with ' \
'appropriate arguments before using this method.'
- check_is_fitted(self, ['cells', 'features'], msg=msg)
+ check_is_fitted(self, ['cells'], msg=msg)
- if type(X) == np.ndarray:
- # Input validation
- X = check_array(X, accept_sparse=True)
- self.is_numpy = True
- X = pd.DataFrame(X, columns=self._features)
- else:
- self.is_numpy = False
+ if X is not None:
+ if dataset is not None:
+ raise ValueError('Either X OR dataset need to be provided, not both')
+ else:
+ dataset = ArrayDataset(X, features_names=features_names)
+ elif dataset is None:
+ raise ValueError('Either X OR dataset need to be provided, not both')
+ if dataset and dataset.features_names:
+ self._features = dataset.features_names
+ if dataset and dataset.get_samples() is not None:
+ x = pd.DataFrame(dataset.get_samples(), columns=self._features)
- if X.shape[1] != self.n_features_ and self.n_features_ != 0:
+ if x.shape[1] != self.n_features_ and self.n_features_ != 0:
raise ValueError('Shape of input is different from what was seen'
'in `fit`')
if not self._features:
- self._features = [i for i in range(X.shape[1])]
+ self._features = [i for i in range(x.shape[1])]
representatives = pd.DataFrame(columns=self._features) # only columns
- generalized = pd.DataFrame(X, columns=self._features, copy=True) # original data
- mapped = np.zeros(X.shape[0]) # to mark records we already mapped
+ generalized = pd.DataFrame(x, columns=self._features, copy=True) # original data
+ mapped = np.zeros(x.shape[0]) # to mark records we already mapped
# iterate over cells (leaves in decision tree)
for i in range(len(self.cells_)):
@@ -443,7 +451,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
representatives = representatives.drop(feature, axis=1)
# get the indexes of all records that map to this cell
- indexes = self._get_record_indexes_for_cell(X, self.cells_[i], mapped)
+ indexes = self._get_record_indexes_for_cell(x, self.cells_[i], mapped)
# replace the values in the representative columns with the representative
# values (leaves others untouched)
@@ -454,9 +462,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
replace = representatives.loc[i].to_frame().T.reset_index(drop=True)
replace.index = indexes
generalized.loc[indexes, representatives.columns] = replace
- if self.is_numpy:
- return generalized.to_numpy()
- return generalized
+ if dataset and dataset.is_pandas:
+ return generalized
+ elif isinstance(X, pd.DataFrame):
+ return generalized
+ return generalized.to_numpy()
def _get_record_indexes_for_cell(self, X, cell, mapped):
indexes = []
@@ -640,7 +650,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# else: nothing to do, stay with previous cells
def _calculate_level_cell_label(self, left_cell, right_cell, new_cell):
- new_cell['hist'] = [x + y for x, y in zip(left_cell['hist'], right_cell['hist'])] if not self.is_regression else []
+ new_cell['hist'] = [x + y for x, y in
+ zip(left_cell['hist'], right_cell['hist'])] if not self.is_regression else []
new_cell['label'] = int(self.dt_.classes_[np.argmax(new_cell['hist'])]) if not self.is_regression else 1
def _get_nodes_level(self, level):
@@ -797,8 +808,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
cells_by_id = copy.deepcopy(self.cells_by_id_)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
- accuracy_gain = self.estimator.score(self._preprocessor.transform(generalized),
- labels) - current_accuracy
+ accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
+ labels)) - current_accuracy
if accuracy_gain < 0:
accuracy_gain = 0
if accuracy_gain != 0:
@@ -820,8 +831,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
cells_by_id = copy.deepcopy(self.cells_by_id_)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
- accuracy_gain = self.estimator.score(self._preprocessor.transform(generalized),
- labels) - current_accuracy
+ accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
+ labels)) - current_accuracy
if accuracy_gain < 0:
accuracy_gain = 0
diff --git a/apt/utils/__init__.py b/apt/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/apt/utils.py b/apt/utils/dataset_utils.py
similarity index 93%
rename from apt/utils.py
rename to apt/utils/dataset_utils.py
index bc73cbc..e3eb959 100644
--- a/apt/utils.py
+++ b/apt/utils/dataset_utils.py
@@ -13,8 +13,7 @@ def _load_iris(test_set_size: float = 0.3):
# Split training and test sets
x_train, x_test, y_train, y_test = model_selection.train_test_split(data, labels, test_size=test_set_size,
- random_state=18, stratify=labels,
- shuffle=True)
+ random_state=18, stratify=labels)
return (x_train, y_train), (x_test, y_test)
@@ -29,6 +28,28 @@ def get_iris_dataset(test_set: float = 0.3):
return _load_iris(test_set)
+def _load_diabetes(test_set_size: float = 0.3):
+ diabetes = datasets.load_diabetes()
+ data = diabetes.data
+ labels = diabetes.target
+
+ # Split training and test sets
+ x_train, x_test, y_train, y_test = model_selection.train_test_split(data, labels, test_size=test_set_size,
+ random_state=18)
+
+ return (x_train, y_train), (x_test, y_test)
+
+
+def get_diabetes_dataset():
+ """
+    Loads the diabetes dataset from scikit-learn, split into train and test sets.
+
+    :return: Train and test splits as ((x_train, y_train), (x_test, y_test)) numpy arrays.
+ """
+ return _load_diabetes()
+
+
def get_german_credit_dataset(test_set: float = 0.3):
"""
Loads the UCI German_credit dataset from `tests/datasets/german` or downloads it if necessary.
@@ -253,7 +274,7 @@ def get_nursery_dataset(raw: bool = True, test_set: float = 0.2, transform_socia
raise Exception("Bad label value: %s" % value)
data["label"] = data["label"].apply(modify_label)
- data["children"] = data["children"].apply(lambda x: 4 if x == "more" else x)
+ data["children"] = data["children"].apply(lambda x: "4" if x == "more" else x)
if transform_social:
diff --git a/apt/utils/datasets/__init__.py b/apt/utils/datasets/__init__.py
new file mode 100644
index 0000000..6e7c640
--- /dev/null
+++ b/apt/utils/datasets/__init__.py
@@ -0,0 +1,7 @@
+"""
+The AI Privacy Toolbox (datasets).
+Implementation of dataset utility components for dataset creation, loading, and storage
+"""
+
+from apt.utils.datasets.datasets import Dataset, StoredDataset, DatasetFactory, Data, ArrayDataset, \
+ OUTPUT_DATA_ARRAY_TYPE, DATA_PANDAS_NUMPY_TYPE
diff --git a/apt/utils/datasets/datasets.py b/apt/utils/datasets/datasets.py
new file mode 100644
index 0000000..ff7c296
--- /dev/null
+++ b/apt/utils/datasets/datasets.py
@@ -0,0 +1,320 @@
+# !/usr/bin/env python
+"""
+The AI Privacy Toolbox (datasets).
+Implementation of utility classes for dataset handling
+"""
+
+from abc import ABCMeta, abstractmethod
+from typing import Callable, Collection, Any, Union, List, Optional
+
+import tarfile
+import os
+import urllib.request
+import numpy as np
+import pandas as pd
+import logging
+import torch
+from torch import Tensor
+
+logger = logging.getLogger(__name__)
+
+
+INPUT_DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame, List, Tensor]
+OUTPUT_DATA_ARRAY_TYPE = np.ndarray
+DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame]
+
+
+def array2numpy(self, arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
+
+ """
+ converts from INPUT_DATA_ARRAY_TYPE to numpy array
+ """
+ if type(arr) == np.ndarray:
+ return arr
+ if type(arr) == pd.DataFrame or type(arr) == pd.Series:
+ self.is_pandas = True
+ return arr.to_numpy()
+ if isinstance(arr, list):
+ return np.array(arr)
+ if type(arr) == Tensor:
+ return arr.detach().cpu().numpy()
+
+ raise ValueError('Non supported type: ', type(arr).__name__)
+
+
+def array2torch_tensor(self, arr: INPUT_DATA_ARRAY_TYPE) -> Tensor:
+ """
+ converts from INPUT_DATA_ARRAY_TYPE to torch tensor array
+ """
+ if type(arr) == np.ndarray:
+ return torch.from_numpy(arr)
+ if type(arr) == pd.DataFrame or type(arr) == pd.Series:
+ self.is_pandas = True
+ return torch.from_numpy(arr.to_numpy())
+ if isinstance(arr, list):
+ return torch.tensor(arr)
+ if type(arr) == Tensor:
+ return arr
+
+ raise ValueError('Non supported type: ', type(arr).__name__)
+
+
+class Dataset(metaclass=ABCMeta):
+ """Base Abstract Class for Dataset"""
+
+ @abstractmethod
+ def __init__(self, **kwargs):
+ pass
+
+ @abstractmethod
+ def get_samples(self) -> Collection[Any]:
+ """Return data samples"""
+ pass
+
+ @abstractmethod
+ def get_labels(self) -> Collection[Any]:
+ """Return labels"""
+ pass
+
+
+class StoredDataset(Dataset):
+ """Abstract Class for Storable Dataset"""
+
+ @abstractmethod
+ def load_from_file(self, path: str):
+ """Load dataset from file"""
+ pass
+
+ @abstractmethod
+ def load(self, **kwargs):
+ """Load dataset"""
+ pass
+
+ @staticmethod
+ def download(url: str, dest_path: str, filename: str, unzip: bool = False) -> None:
+ """
+ Download the dataset from URL
+ :param url: dataset URL, the dataset will be requested from this URL
+ :param dest_path: local dataset destination path
+ :param filename: local dataset filename
+        :param unzip: flag indicating whether or not to perform extraction
+ :return: None
+ """
+ file_path = os.path.join(dest_path, filename)
+
+ if os.path.exists(file_path):
+ logger.warning("Files already downloaded, skipping downloading")
+
+ else:
+ os.makedirs(dest_path, exist_ok=True)
+ logger.info("Downloading the dataset...")
+ urllib.request.urlretrieve(url, file_path)
+ logger.info('Dataset Downloaded')
+
+ if unzip:
+ StoredDataset.extract_archive(zip_path=file_path, dest_path=dest_path, remove_archive=False)
+
+ @staticmethod
+ def extract_archive(zip_path: str, dest_path=None, remove_archive=False):
+ """
+ Extract dataset from archived file
+ :param zip_path: path to archived file
+ :param dest_path: directory path to uncompress the file to
+        :param remove_archive: whether to remove the archive file after extraction (default False)
+ :return: None
+ """
+ logger.info("Extracting the dataset...")
+ tar = tarfile.open(zip_path)
+ tar.extractall(path=dest_path)
+
+ logger.info("Dataset was extracted to {}".format(dest_path))
+ if remove_archive:
+ logger.info("Removing a zip file")
+ os.remove(zip_path)
+ logger.info("Extracted the dataset")
+
+ @staticmethod
+ def split_debug(datafile: str, dest_datafile: str, ratio: int, shuffle=True, delimiter=",", fmt=None) -> None:
+ """
+ Split the data and take only a part of it
+ :param datafile: dataset file path
+ :param dest_datafile: destination path for the partial dataset file
+ :param ratio: part of the dataset to save
+ :param shuffle: whether to shuffle the data or not (default True)
+ :param delimiter: dataset delimiter (default ",")
+ :param fmt: format for the correct data saving
+ :return: None
+ """
+ if os.path.isfile(dest_datafile):
+ logger.info(f"The partial debug split already exists {dest_datafile}")
+ return
+ else:
+ os.makedirs(os.path.dirname(dest_datafile), exist_ok=True)
+
+ data = np.genfromtxt(datafile, delimiter=delimiter)
+ if shuffle:
+ logger.info("Shuffling data")
+ np.random.shuffle(data)
+
+ debug_data = data[:int(len(data) * ratio)]
+ logger.info(f"Saving {ratio} of the data to {dest_datafile}")
+ np.savetxt(dest_datafile, debug_data, delimiter=delimiter, fmt=fmt)
+
+
+class ArrayDataset(Dataset):
+ """Dataset that is based on x and y arrays (e.g., numpy/pandas/list...)"""
+
+ def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
+ features_names: Optional = None, **kwargs):
+ """
+ ArrayDataset constructor.
+ :param x: collection of data samples
+ :param y: collection of labels (optional)
+        :param features_names: list of str, The feature names, in the order that they appear in the data (optional)
+ :param kwargs: dataset parameters
+ """
+ self.is_pandas = False
+ self.features_names = features_names
+ self._y = array2numpy(self, y) if y is not None else None
+ self._x = array2numpy(self, x)
+ if self.is_pandas:
+ if features_names and not np.array_equal(features_names, x.columns):
+ raise ValueError("The supplied features are not the same as in the data features")
+ self.features_names = x.columns.to_list()
+
+ if y is not None and len(self._x) != len(self._y):
+ raise ValueError('Non equivalent lengths of x and y')
+
+ def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
+ """Return data samples as numpy array"""
+ return self._x
+
+ def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
+ """Return labels as numpy array"""
+ return self._y
+
+
+class PytorchData(Dataset):
+
+ def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None, **kwargs):
+ """
+ PytorchData constructor.
+ :param x: collection of data samples
+ :param y: collection of labels (optional)
+ :param kwargs: dataset parameters
+ """
+ self.is_pandas = False
+ self._y = array2torch_tensor(self, y) if y is not None else None
+ self._x = array2torch_tensor(self, x)
+ if self.is_pandas:
+ self.features_names = x.columns
+
+ if y is not None and len(self._x) != len(self._y):
+ raise ValueError('Non equivalent lengths of x and y')
+
+
+ if self._y is not None:
+ self.__getitem__ = self.get_item
+ else:
+ self.__getitem__ = self.get_sample_item
+
+
+ def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
+ """Return data samples as numpy array"""
+ return array2numpy(self._x)
+
+ def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
+ """Return labels as numpy array"""
+ return array2numpy(self._y) if self._y is not None else None
+
+ def get_sample_item(self, idx) -> Tensor:
+ return self.x[idx]
+
+ def get_item(self, idx) -> Tensor:
+ sample, label = self.x[idx], self.y[idx]
+ return sample, label
+
+ def __len__(self):
+ return len(self.x)
+
+
+class DatasetFactory:
+ """Factory class for dataset creation"""
+ registry = {}
+
+ @classmethod
+ def register(cls, name: str) -> Callable:
+ """
+ Class method to register Dataset to the internal registry
+ :param name: dataset name
+ :return:
+ """
+
+ def inner_wrapper(wrapped_class: Dataset) -> Any:
+ if name in cls.registry:
+ logger.warning('Dataset %s already exists. Will replace it', name)
+ cls.registry[name] = wrapped_class
+ return wrapped_class
+
+ return inner_wrapper
+
+ @classmethod
+ def create_dataset(cls, name: str, **kwargs) -> Dataset:
+ """
+ Factory command to create dataset instance.
+ This method gets the appropriate Dataset class from the registry
+ and creates an instance of it, while passing in the parameters
+ given in ``kwargs``.
+ :param name: The name of the dataset to create.
+ :param kwargs: dataset parameters
+ :return: An instance of the dataset that is created.
+ """
+ if name not in cls.registry:
+ msg = f'Dataset {name} does not exist in the registry'
+ logger.error(msg)
+ raise ValueError(msg)
+
+ exec_class = cls.registry[name]
+ executor = exec_class(**kwargs)
+ return executor
+
+
+class Data:
+ def __init__(self, train: Dataset = None, test: Dataset = None, **kwargs):
+ """
+ Data class constructor.
+ The class stores train and test datasets.
+ If neither of the datasets was provided,
+        both train and test datasets will be created using
+ DatasetFactory to create a dataset instance
+ """
+ if train or test:
+ self.train = train
+ self.test = test
+ else:
+ self.train = DatasetFactory.create_dataset(train=True, **kwargs)
+ self.test = DatasetFactory.create_dataset(train=False, **kwargs)
+
+ def get_train_set(self) -> Dataset:
+        """Return the train Dataset"""
+ return self.train
+
+ def get_test_set(self) -> Dataset:
+        """Return the test Dataset"""
+ return self.test
+
+ def get_train_samples(self) -> Collection[Any]:
+ """Return train set samples"""
+ return self.train.get_samples()
+
+ def get_train_labels(self) -> Collection[Any]:
+ """Return train set labels"""
+ return self.train.get_labels()
+
+ def get_test_samples(self) -> Collection[Any]:
+ """Return test set samples"""
+ return self.test.get_samples()
+
+ def get_test_labels(self) -> Collection[Any]:
+ """Return test set labels"""
+ return self.test.get_labels()
diff --git a/apt/utils/models/__init__.py b/apt/utils/models/__init__.py
new file mode 100644
index 0000000..11efd5f
--- /dev/null
+++ b/apt/utils/models/__init__.py
@@ -0,0 +1,2 @@
+from apt.utils.models.model import Model, ModelOutputType
+from apt.utils.models.sklearn_model import SklearnModel, SklearnClassifier, SklearnRegressor
diff --git a/apt/utils/models/model.py b/apt/utils/models/model.py
new file mode 100644
index 0000000..9616459
--- /dev/null
+++ b/apt/utils/models/model.py
@@ -0,0 +1,109 @@
+from abc import ABCMeta, abstractmethod
+from typing import Any, Optional
+from enum import Enum, auto
+
+from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
+
+
+class ModelOutputType(Enum):
+ CLASSIFIER_VECTOR = auto() # probabilities or logits
+ CLASSIFIER_SCALAR = auto() # label only
+ REGRESSOR_SCALAR = auto() # value
+
+
+class Model(metaclass=ABCMeta):
+ """
+ Abstract base class for ML model wrappers.
+ """
+
+ def __init__(self, model: Any, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
+ unlimited_queries: Optional[bool] = True, **kwargs):
+ """
+ Initialize a `Model` wrapper object.
+
+ :param model: The original model object (of the underlying ML framework)
+ :param output_type: The type of output the model yields (vector/label only for classifiers,
+ value for regressors)
+ :param black_box_access: Boolean describing the type of deployment of the model (when in production).
+ Set to True if the model is only available via query (API) access, i.e.,
+ only the outputs of the model are exposed, and False if the model internals
+ are also available. Optional, Default is True.
+ :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
+ unlimited queries to the model API or whether there is a limit to the number of
+ queries that can be submitted. Optional, Default is True.
+ """
+ self._model = model
+ self._output_type = output_type
+ self._black_box_access = black_box_access
+ self._unlimited_queries = unlimited_queries
+
+ @abstractmethod
+ def fit(self, train_data: Dataset, **kwargs) -> None:
+ """
+ Fit the model using the training data.
+
+ :param train_data: Training data.
+ :type train_data: `Dataset`
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
+ """
+ Perform predictions using the model for input `x`.
+
+ :param x: Input samples.
+        :type x: `Dataset`
+ :return: Predictions from the model.
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def score(self, test_data: Dataset, **kwargs):
+ """
+ Score the model using test data.
+
+ :param test_data: Test data.
+        :type test_data: `Dataset`
+ """
+ return NotImplementedError
+
+ @property
+ def model(self) -> Any:
+ """
+ Return the model.
+
+ :return: The model.
+ """
+ return self._model
+
+ @property
+ def output_type(self) -> ModelOutputType:
+ """
+ Return the model's output type.
+
+ :return: The model's output type.
+ """
+ return self._output_type
+
+ @property
+ def black_box_access(self) -> bool:
+ """
+ Return True if the model is only available via query (API) access, i.e.,
+ only the outputs of the model are exposed, and False if the model internals are also available.
+
+ :return: True if the model is only available via query (API) access, i.e.,
+ only the outputs of the model are exposed, and False if the model internals are also available.
+ """
+ return self._black_box_access
+
+ @property
+ def unlimited_queries(self) -> bool:
+ """
+ If black_box_access is True, Return whether a user can perform unlimited queries to the model API
+ or whether there is a limit to the number of queries that can be submitted.
+
+ :return: If black_box_access is True, Return whether a user can perform unlimited queries to the model API
+ or whether there is a limit to the number of queries that can be submitted.
+ """
+ return self._unlimited_queries
diff --git a/apt/utils/models/sklearn_model.py b/apt/utils/models/sklearn_model.py
new file mode 100644
index 0000000..f7afaa6
--- /dev/null
+++ b/apt/utils/models/sklearn_model.py
@@ -0,0 +1,112 @@
+from typing import Optional
+
+import numpy as np
+
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.base import BaseEstimator
+
+from apt.utils.models import Model, ModelOutputType
+from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
+
+from art.estimators.classification.scikitlearn import SklearnClassifier as ArtSklearnClassifier
+from art.estimators.regression.scikitlearn import ScikitlearnRegressor
+
+
+class SklearnModel(Model):
+ """
+ Wrapper class for scikitlearn models.
+ """
+ def score(self, test_data: Dataset, **kwargs):
+ """
+ Score the model using test data.
+
+ :param test_data: Test data.
+        :type test_data: `Dataset`
+ """
+ return self.model.score(test_data.get_samples(), test_data.get_labels(), **kwargs)
+
+
+class SklearnClassifier(SklearnModel):
+ """
+ Wrapper class for scikitlearn classification models.
+ """
+ def __init__(self, model: BaseEstimator, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
+ unlimited_queries: Optional[bool] = True, **kwargs):
+ """
+ Initialize a `SklearnClassifier` wrapper object.
+
+ :param model: The original sklearn model object.
+ :param output_type: The type of output the model yields (vector/label only for classifiers,
+ value for regressors)
+ :param black_box_access: Boolean describing the type of deployment of the model (when in production).
+ Set to True if the model is only available via query (API) access, i.e.,
+ only the outputs of the model are exposed, and False if the model internals
+ are also available. Optional, Default is True.
+ :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
+ unlimited queries to the model API or whether there is a limit to the number of
+ queries that can be submitted. Optional, Default is True.
+ """
+ super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs)
+ self._art_model = ArtSklearnClassifier(model)
+
+ def fit(self, train_data: Dataset, **kwargs) -> None:
+ """
+ Fit the model using the training data.
+
+ :param train_data: Training data.
+ :type train_data: `Dataset`
+ """
+ encoder = OneHotEncoder(sparse=False)
+ y_encoded = encoder.fit_transform(train_data.get_labels().reshape(-1, 1))
+ self._art_model.fit(train_data.get_samples(), y_encoded, **kwargs)
+
+ def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
+ """
+ Perform predictions using the model for input `x`.
+
+ :param x: Input samples.
+        :type x: `Dataset`
+ :return: Predictions from the model (class probabilities, if supported).
+ """
+ return self._art_model.predict(x, **kwargs)
+
+
+class SklearnRegressor(SklearnModel):
+ """
+ Wrapper class for scikitlearn regression models.
+ """
+ def __init__(self, model: BaseEstimator, black_box_access: Optional[bool] = True,
+ unlimited_queries: Optional[bool] = True, **kwargs):
+ """
+ Initialize a `SklearnRegressor` wrapper object.
+
+ :param model: The original sklearn model object.
+ :param black_box_access: Boolean describing the type of deployment of the model (when in production).
+ Set to True if the model is only available via query (API) access, i.e.,
+ only the outputs of the model are exposed, and False if the model internals
+ are also available. Optional, Default is True.
+ :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
+ unlimited queries to the model API or whether there is a limit to the number of
+ queries that can be submitted. Optional, Default is True.
+ """
+ super().__init__(model, ModelOutputType.REGRESSOR_SCALAR, black_box_access, unlimited_queries, **kwargs)
+ self._art_model = ScikitlearnRegressor(model)
+
+ def fit(self, train_data: Dataset, **kwargs) -> None:
+ """
+ Fit the model using the training data.
+
+ :param train_data: Training data.
+ :type train_data: `Dataset`
+ """
+ self._art_model.fit(train_data.get_samples(), train_data.get_labels(), **kwargs)
+
+ def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
+ """
+ Perform predictions using the model for input `x`.
+
+ :param x: Input samples.
+        :type x: `Dataset`
+ :return: Predictions from the model.
+ """
+ return self._art_model.predict(x, **kwargs)
diff --git a/notebooks/attribute_inference_anonymization_nursery.ipynb b/notebooks/attribute_inference_anonymization_nursery.ipynb
index 9952885..bfba540 100644
--- a/notebooks/attribute_inference_anonymization_nursery.ipynb
+++ b/notebooks/attribute_inference_anonymization_nursery.ipynb
@@ -29,198 +29,15 @@
},
{
"cell_type": "code",
- "execution_count": 61,
+ "execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " parents | \n",
- " has_nurs | \n",
- " form | \n",
- " children | \n",
- " housing | \n",
- " finance | \n",
- " social | \n",
- " health | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 8450 | \n",
- " pretentious | \n",
- " very_crit | \n",
- " foster | \n",
- " 1 | \n",
- " less_conv | \n",
- " convenient | \n",
- " 1 | \n",
- " not_recom | \n",
- "
\n",
- " \n",
- " | 12147 | \n",
- " great_pret | \n",
- " very_crit | \n",
- " complete | \n",
- " 1 | \n",
- " critical | \n",
- " inconv | \n",
- " 1 | \n",
- " recommended | \n",
- "
\n",
- " \n",
- " | 2780 | \n",
- " usual | \n",
- " critical | \n",
- " complete | \n",
- " 4 | \n",
- " less_conv | \n",
- " convenient | \n",
- " 1 | \n",
- " not_recom | \n",
- "
\n",
- " \n",
- " | 11924 | \n",
- " great_pret | \n",
- " critical | \n",
- " foster | \n",
- " 1 | \n",
- " critical | \n",
- " convenient | \n",
- " 1 | \n",
- " not_recom | \n",
- "
\n",
- " \n",
- " | 59 | \n",
- " usual | \n",
- " proper | \n",
- " complete | \n",
- " 2 | \n",
- " convenient | \n",
- " convenient | \n",
- " 0 | \n",
- " not_recom | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 5193 | \n",
- " pretentious | \n",
- " less_proper | \n",
- " complete | \n",
- " 1 | \n",
- " convenient | \n",
- " inconv | \n",
- " 0 | \n",
- " recommended | \n",
- "
\n",
- " \n",
- " | 1375 | \n",
- " usual | \n",
- " less_proper | \n",
- " incomplete | \n",
- " 2 | \n",
- " less_conv | \n",
- " convenient | \n",
- " 1 | \n",
- " priority | \n",
- "
\n",
- " \n",
- " | 10318 | \n",
- " great_pret | \n",
- " less_proper | \n",
- " foster | \n",
- " 4 | \n",
- " convenient | \n",
- " convenient | \n",
- " 0 | \n",
- " priority | \n",
- "
\n",
- " \n",
- " | 6396 | \n",
- " pretentious | \n",
- " improper | \n",
- " completed | \n",
- " 3 | \n",
- " less_conv | \n",
- " convenient | \n",
- " 1 | \n",
- " recommended | \n",
- "
\n",
- " \n",
- " | 485 | \n",
- " usual | \n",
- " proper | \n",
- " incomplete | \n",
- " 1 | \n",
- " critical | \n",
- " inconv | \n",
- " 1 | \n",
- " not_recom | \n",
- "
\n",
- " \n",
- "
\n",
- "
10366 rows × 8 columns
\n",
- "
"
- ],
- "text/plain": [
- " parents has_nurs form children housing finance \\\n",
- "8450 pretentious very_crit foster 1 less_conv convenient \n",
- "12147 great_pret very_crit complete 1 critical inconv \n",
- "2780 usual critical complete 4 less_conv convenient \n",
- "11924 great_pret critical foster 1 critical convenient \n",
- "59 usual proper complete 2 convenient convenient \n",
- "... ... ... ... ... ... ... \n",
- "5193 pretentious less_proper complete 1 convenient inconv \n",
- "1375 usual less_proper incomplete 2 less_conv convenient \n",
- "10318 great_pret less_proper foster 4 convenient convenient \n",
- "6396 pretentious improper completed 3 less_conv convenient \n",
- "485 usual proper incomplete 1 critical inconv \n",
- "\n",
- " social health \n",
- "8450 1 not_recom \n",
- "12147 1 recommended \n",
- "2780 1 not_recom \n",
- "11924 1 not_recom \n",
- "59 0 not_recom \n",
- "... ... ... \n",
- "5193 0 recommended \n",
- "1375 1 priority \n",
- "10318 0 priority \n",
- "6396 1 recommended \n",
- "485 1 not_recom \n",
- "\n",
- "[10366 rows x 8 columns]"
- ]
+ "text/plain": " parents has_nurs form children housing finance \\\n8450 pretentious very_crit foster 1 less_conv convenient \n12147 great_pret very_crit complete 1 critical inconv \n2780 usual critical complete 4 less_conv convenient \n11924 great_pret critical foster 1 critical convenient \n59 usual proper complete 2 convenient convenient \n... ... ... ... ... ... ... \n5193 pretentious less_proper complete 1 convenient inconv \n1375 usual less_proper incomplete 2 less_conv convenient \n10318 great_pret less_proper foster 4 convenient convenient \n6396 pretentious improper completed 3 less_conv convenient \n485 usual proper incomplete 1 critical inconv \n\n social health \n8450 1 not_recom \n12147 1 recommended \n2780 1 not_recom \n11924 1 not_recom \n59 0 not_recom \n... ... ... \n5193 0 recommended \n1375 1 priority \n10318 0 priority \n6396 1 recommended \n485 1 not_recom \n\n[10366 rows x 8 columns]",
+ "text/html": "\n\n
\n \n \n | \n parents | \n has_nurs | \n form | \n children | \n housing | \n finance | \n social | \n health | \n
\n \n \n \n | 8450 | \n pretentious | \n very_crit | \n foster | \n 1 | \n less_conv | \n convenient | \n 1 | \n not_recom | \n
\n \n | 12147 | \n great_pret | \n very_crit | \n complete | \n 1 | \n critical | \n inconv | \n 1 | \n recommended | \n
\n \n | 2780 | \n usual | \n critical | \n complete | \n 4 | \n less_conv | \n convenient | \n 1 | \n not_recom | \n
\n \n | 11924 | \n great_pret | \n critical | \n foster | \n 1 | \n critical | \n convenient | \n 1 | \n not_recom | \n
\n \n | 59 | \n usual | \n proper | \n complete | \n 2 | \n convenient | \n convenient | \n 0 | \n not_recom | \n
\n \n | ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n
\n \n | 5193 | \n pretentious | \n less_proper | \n complete | \n 1 | \n convenient | \n inconv | \n 0 | \n recommended | \n
\n \n | 1375 | \n usual | \n less_proper | \n incomplete | \n 2 | \n less_conv | \n convenient | \n 1 | \n priority | \n
\n \n | 10318 | \n great_pret | \n less_proper | \n foster | \n 4 | \n convenient | \n convenient | \n 0 | \n priority | \n
\n \n | 6396 | \n pretentious | \n improper | \n completed | \n 3 | \n less_conv | \n convenient | \n 1 | \n recommended | \n
\n \n | 485 | \n usual | \n proper | \n incomplete | \n 1 | \n critical | \n inconv | \n 1 | \n not_recom | \n
\n \n
\n
10366 rows × 8 columns
\n
"
},
- "execution_count": 61,
+ "execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
@@ -230,7 +47,7 @@
"import sys\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
"\n",
- "from apt.utils import get_nursery_dataset\n",
+ "from apt.utils.dataset_utils import get_nursery_dataset\n",
"\n",
"(x_train, y_train), (x_test, y_test) = get_nursery_dataset(transform_social=True)\n",
"\n",
@@ -246,7 +63,7 @@
},
{
"cell_type": "code",
- "execution_count": 62,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -263,9 +80,9 @@
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"x_train_str = x_train.astype(str)\n",
- "train_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(x_train_str)\n",
+ "train_encoded = OneHotEncoder(sparse=False).fit_transform(x_train_str)\n",
"x_test_str = x_test.astype(str)\n",
- "test_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(x_test_str)\n",
+ "test_encoded = OneHotEncoder(sparse=False).fit_transform(x_test_str)\n",
" \n",
"model = DecisionTreeClassifier()\n",
"model.fit(train_encoded, y_train)\n",
@@ -287,7 +104,7 @@
},
{
"cell_type": "code",
- "execution_count": 91,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -323,14 +140,14 @@
},
{
"cell_type": "code",
- "execution_count": 96,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "0.6430638626278217\n"
+ "1.0\n"
]
}
],
@@ -361,14 +178,14 @@
},
{
"cell_type": "code",
- "execution_count": 55,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "0.6980513216284006\n"
+ "0.5122515917422342\n"
]
}
],
@@ -408,224 +225,43 @@
},
{
"cell_type": "code",
- "execution_count": 97,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " parents | \n",
- " has_nurs | \n",
- " form | \n",
- " children | \n",
- " housing | \n",
- " finance | \n",
- " social | \n",
- " health | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 8450 | \n",
- " pretentious | \n",
- " very_crit | \n",
- " foster | \n",
- " 1 | \n",
- " less_conv | \n",
- " convenient | \n",
- " 0 | \n",
- " not_recom | \n",
- "
\n",
- " \n",
- " | 12147 | \n",
- " great_pret | \n",
- " very_crit | \n",
- " complete | \n",
- " 1 | \n",
- " critical | \n",
- " inconv | \n",
- " 1 | \n",
- " recommended | \n",
- "
\n",
- " \n",
- " | 2780 | \n",
- " usual | \n",
- " critical | \n",
- " complete | \n",
- " 4 | \n",
- " less_conv | \n",
- " convenient | \n",
- " 0 | \n",
- " not_recom | \n",
- "
\n",
- " \n",
- " | 11924 | \n",
- " great_pret | \n",
- " critical | \n",
- " foster | \n",
- " 1 | \n",
- " critical | \n",
- " convenient | \n",
- " 0 | \n",
- " not_recom | \n",
- "
\n",
- " \n",
- " | 59 | \n",
- " usual | \n",
- " proper | \n",
- " complete | \n",
- " 2 | \n",
- " convenient | \n",
- " convenient | \n",
- " 0 | \n",
- " not_recom | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 5193 | \n",
- " pretentious | \n",
- " less_proper | \n",
- " complete | \n",
- " 1 | \n",
- " convenient | \n",
- " inconv | \n",
- " 0 | \n",
- " recommended | \n",
- "
\n",
- " \n",
- " | 1375 | \n",
- " usual | \n",
- " less_proper | \n",
- " incomplete | \n",
- " 2 | \n",
- " less_conv | \n",
- " convenient | \n",
- " 1 | \n",
- " priority | \n",
- "
\n",
- " \n",
- " | 10318 | \n",
- " great_pret | \n",
- " less_proper | \n",
- " foster | \n",
- " 4 | \n",
- " convenient | \n",
- " convenient | \n",
- " 0 | \n",
- " priority | \n",
- "
\n",
- " \n",
- " | 6396 | \n",
- " pretentious | \n",
- " improper | \n",
- " completed | \n",
- " 3 | \n",
- " less_conv | \n",
- " convenient | \n",
- " 1 | \n",
- " recommended | \n",
- "
\n",
- " \n",
- " | 485 | \n",
- " usual | \n",
- " proper | \n",
- " incomplete | \n",
- " 1 | \n",
- " critical | \n",
- " convenient | \n",
- " 0 | \n",
- " not_recom | \n",
- "
\n",
- " \n",
- "
\n",
- "
10366 rows × 8 columns
\n",
- "
"
- ],
- "text/plain": [
- " parents has_nurs form children housing finance \\\n",
- "8450 pretentious very_crit foster 1 less_conv convenient \n",
- "12147 great_pret very_crit complete 1 critical inconv \n",
- "2780 usual critical complete 4 less_conv convenient \n",
- "11924 great_pret critical foster 1 critical convenient \n",
- "59 usual proper complete 2 convenient convenient \n",
- "... ... ... ... ... ... ... \n",
- "5193 pretentious less_proper complete 1 convenient inconv \n",
- "1375 usual less_proper incomplete 2 less_conv convenient \n",
- "10318 great_pret less_proper foster 4 convenient convenient \n",
- "6396 pretentious improper completed 3 less_conv convenient \n",
- "485 usual proper incomplete 1 critical convenient \n",
- "\n",
- " social health \n",
- "8450 0 not_recom \n",
- "12147 1 recommended \n",
- "2780 0 not_recom \n",
- "11924 0 not_recom \n",
- "59 0 not_recom \n",
- "... ... ... \n",
- "5193 0 recommended \n",
- "1375 1 priority \n",
- "10318 0 priority \n",
- "6396 1 recommended \n",
- "485 0 not_recom \n",
- "\n",
- "[10366 rows x 8 columns]"
- ]
+ "text/plain": " parents has_nurs form children housing finance \\\n0 pretentious very_crit foster 1 less_conv convenient \n1 great_pret very_crit complete 1 critical inconv \n2 usual critical complete 4 less_conv convenient \n3 great_pret critical foster 1 critical convenient \n4 usual proper complete 2 convenient convenient \n... ... ... ... ... ... ... \n10361 pretentious less_proper complete 1 convenient inconv \n10362 usual less_proper incomplete 2 less_conv convenient \n10363 great_pret less_proper foster 4 convenient convenient \n10364 pretentious improper completed 3 less_conv convenient \n10365 usual proper incomplete 1 critical convenient \n\n social health \n0 0 not_recom \n1 1 recommended \n2 0 not_recom \n3 0 not_recom \n4 0 not_recom \n... ... ... \n10361 0 recommended \n10362 1 priority \n10363 0 priority \n10364 1 recommended \n10365 0 not_recom \n\n[10366 rows x 8 columns]",
+ "text/html": "\n\n
\n \n \n | \n parents | \n has_nurs | \n form | \n children | \n housing | \n finance | \n social | \n health | \n
\n \n \n \n | 0 | \n pretentious | \n very_crit | \n foster | \n 1 | \n less_conv | \n convenient | \n 0 | \n not_recom | \n
\n \n | 1 | \n great_pret | \n very_crit | \n complete | \n 1 | \n critical | \n inconv | \n 1 | \n recommended | \n
\n \n | 2 | \n usual | \n critical | \n complete | \n 4 | \n less_conv | \n convenient | \n 0 | \n not_recom | \n
\n \n | 3 | \n great_pret | \n critical | \n foster | \n 1 | \n critical | \n convenient | \n 0 | \n not_recom | \n
\n \n | 4 | \n usual | \n proper | \n complete | \n 2 | \n convenient | \n convenient | \n 0 | \n not_recom | \n
\n \n | ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n
\n \n | 10361 | \n pretentious | \n less_proper | \n complete | \n 1 | \n convenient | \n inconv | \n 0 | \n recommended | \n
\n \n | 10362 | \n usual | \n less_proper | \n incomplete | \n 2 | \n less_conv | \n convenient | \n 1 | \n priority | \n
\n \n | 10363 | \n great_pret | \n less_proper | \n foster | \n 4 | \n convenient | \n convenient | \n 0 | \n priority | \n
\n \n | 10364 | \n pretentious | \n improper | \n completed | \n 3 | \n less_conv | \n convenient | \n 1 | \n recommended | \n
\n \n | 10365 | \n usual | \n proper | \n incomplete | \n 1 | \n critical | \n convenient | \n 0 | \n not_recom | \n
\n \n
\n
10366 rows × 8 columns
\n
"
},
- "execution_count": 97,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
+ "from apt.utils.datasets import ArrayDataset\n",
"from apt.anonymization import Anonymize\n",
"\n",
+ "features = x_train.columns\n",
"QI = [\"finance\", \"social\", \"health\"]\n",
"categorical_features = [\"parents\", \"has_nurs\", \"form\", \"housing\", \"finance\", \"health\", 'children']\n",
- "anonymizer = Anonymize(100, QI, categorical_features=categorical_features)\n",
- "anon = anonymizer.anonymize(x_train, x_train_predictions)\n",
- "anon"
+ "QI_indexes = [i for i, v in enumerate(features) if v in QI]\n",
+ "categorical_features_indexes = [i for i, v in enumerate(features) if v in categorical_features]\n",
+ "anonymizer = Anonymize(100, QI_indexes, categorical_features=categorical_features_indexes)\n",
+ "anon = anonymizer.anonymize(ArrayDataset(x_train, x_train_predictions))\n",
+ "anon\n"
]
},
{
"cell_type": "code",
- "execution_count": 64,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
- "text/plain": [
- "7585"
- ]
+ "text/plain": "7585"
},
- "execution_count": 64,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -637,16 +273,14 @@
},
{
"cell_type": "code",
- "execution_count": 65,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
- "text/plain": [
- "5766"
- ]
+ "text/plain": "5766"
},
- "execution_count": 65,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -665,7 +299,7 @@
},
{
"cell_type": "code",
- "execution_count": 66,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -678,7 +312,7 @@
],
"source": [
"anon_str = anon.astype(str)\n",
- "anon_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(anon_str)\n",
+ "anon_encoded = OneHotEncoder(sparse=False).fit_transform(anon_str)\n",
"\n",
"anon_model = DecisionTreeClassifier()\n",
"anon_model.fit(anon_encoded, y_train)\n",
@@ -698,14 +332,14 @@
},
{
"cell_type": "code",
- "execution_count": 98,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "0.6471155701331275\n"
+ "1.0\n"
]
}
],
@@ -734,14 +368,14 @@
},
{
"cell_type": "code",
- "execution_count": 69,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "0.6982442600810341\n"
+ "0.5245996527107852\n"
]
}
],
@@ -765,15 +399,15 @@
},
{
"cell_type": "code",
- "execution_count": 87,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "(0.33056202194878614, 0.2888695146759663)\n",
- "(0.34112301200908796, 0.3054344667247893)\n"
+ "(0.49415432579890883, 0.48976438779451525)\n",
+ "(0.49415432579890883, 0.48976438779451525)\n"
]
}
],
@@ -810,15 +444,15 @@
},
{
"cell_type": "code",
- "execution_count": 88,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "(0.6457357075913777, 0.2002324905550712)\n",
- "(0.6472248353715898, 0.1999418773612322)\n"
+ "(1.0, 0.019204655674102813)\n",
+ "(0.9829787234042553, 0.04481086323957323)\n"
]
}
],
@@ -849,26 +483,24 @@
},
{
"cell_type": "code",
- "execution_count": 74,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
- "anonymizer2 = Anonymize(1000, QI, categorical_features=categorical_features)\n",
- "anon2 = anonymizer2.anonymize(x_train, x_train_predictions)"
+ "anonymizer2 = Anonymize(1000, QI_indexes, categorical_features=categorical_features_indexes)\n",
+ "anon2 = anonymizer2.anonymize(ArrayDataset(x_train, x_train_predictions))"
]
},
{
"cell_type": "code",
- "execution_count": 75,
+ "execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
- "text/plain": [
- "4226"
- ]
+ "text/plain": "4226"
},
- "execution_count": 75,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -887,7 +519,7 @@
},
{
"cell_type": "code",
- "execution_count": 104,
+ "execution_count": 16,
"metadata": {},
"outputs": [
{
@@ -900,7 +532,7 @@
],
"source": [
"anon2_str = anon2.astype(str)\n",
- "anon2_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(anon2_str)\n",
+ "anon2_encoded = OneHotEncoder(sparse=False).fit_transform(anon2_str)\n",
"\n",
"anon2_model = DecisionTreeClassifier()\n",
"anon2_model.fit(anon2_encoded, y_train)\n",
@@ -920,14 +552,14 @@
},
{
"cell_type": "code",
- "execution_count": 105,
+ "execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "0.6266640941539648\n"
+ "1.0\n"
]
}
],
@@ -956,14 +588,14 @@
},
{
"cell_type": "code",
- "execution_count": 106,
+ "execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "0.6944819602546788\n"
+ "0.515820953115956\n"
]
}
],
@@ -980,17 +612,17 @@
},
{
"cell_type": "code",
- "execution_count": 107,
+ "execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "(0.35793357933579334, 0.17037470725995316)\n",
- "(0.3360655737704918, 0.1680327868852459)\n",
- "(0.6457357075913777, 0.2002324905550712)\n",
- "(0.6327519379844961, 0.1897704155768672)\n"
+ "(0.49415432579890883, 0.48976438779451525)\n",
+ "(0.49415432579890883, 0.48976438779451525)\n",
+ "(1.0, 0.019204655674102813)\n",
+ "(1.0, 0.026382153249272552)\n"
]
}
],
@@ -1023,27 +655,26 @@
},
{
"cell_type": "code",
- "execution_count": 111,
+ "execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"QI2 = [\"parents\", \"has_nurs\", \"form\", \"children\", \"housing\", \"finance\", \"social\", \"health\"]\n",
- "anonymizer3 = Anonymize(100, QI2, categorical_features=categorical_features)\n",
- "anon3 = anonymizer3.anonymize(x_train, x_train_predictions)"
+ "QI2_indexes = [i for i, v in enumerate(features) if v in QI2]\n",
+ "anonymizer3 = Anonymize(100, QI2_indexes, categorical_features=categorical_features_indexes)\n",
+ "anon3 = anonymizer3.anonymize(ArrayDataset(x_train, x_train_predictions))"
]
},
{
"cell_type": "code",
- "execution_count": 112,
+ "execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
- "text/plain": [
- "39"
- ]
+ "text/plain": "39"
},
- "execution_count": 112,
+ "execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@@ -1055,22 +686,22 @@
},
{
"cell_type": "code",
- "execution_count": 113,
+ "execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Anonymized model accuracy: 0.7723765432098766\n",
- "BB attack accuracy: 0.5792012348060969\n",
- "WB attack accuracy: 0.6680493922438742\n"
+ "Anonymized model accuracy: 0.751929012345679\n",
+ "BB attack accuracy: 1.0\n",
+ "WB attack accuracy: 0.5187150299054601\n"
]
}
],
"source": [
"anon3_str = anon3.astype(str)\n",
- "anon3_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(anon3_str)\n",
+ "anon3_encoded = OneHotEncoder(sparse=False).fit_transform(anon3_str)\n",
"\n",
"anon3_model = DecisionTreeClassifier()\n",
"anon3_model.fit(anon3_encoded, y_train)\n",
@@ -1105,17 +736,17 @@
},
{
"cell_type": "code",
- "execution_count": 114,
+ "execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "(0.35793357933579334, 0.17037470725995316)\n",
- "(0.3393939393939394, 0.13114754098360656)\n",
- "(0.6457357075913777, 0.2002324905550712)\n",
- "(1, 0.0)\n"
+ "(0.49415432579890883, 0.48976438779451525)\n",
+ "(0.49415432579890883, 0.48976438779451525)\n",
+ "(1.0, 0.019204655674102813)\n",
+ "(1.0, 0.032201745877788554)\n"
]
}
],
@@ -1162,4 +793,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
-}
+}
\ No newline at end of file
diff --git a/notebooks/membership_inference_anonymization_adult.ipynb b/notebooks/membership_inference_anonymization_adult.ipynb
index c2c7e74..4a0ea00 100644
--- a/notebooks/membership_inference_anonymization_adult.ipynb
+++ b/notebooks/membership_inference_anonymization_adult.ipynb
@@ -29,7 +29,7 @@
},
{
"cell_type": "code",
- "execution_count": 97,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -44,6 +44,18 @@
" [ 26. 11. 0. 0. 48.]\n",
" [ 27. 9. 0. 0. 40.]]\n"
]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_85828/3975777015.py:22: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " y_train = y_train.astype(np.int)\n",
+ "/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_85828/3975777015.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " y_test = y_test.astype(np.int)\n"
+ ]
}
],
"source": [
@@ -90,14 +102,14 @@
},
{
"cell_type": "code",
- "execution_count": 116,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Base model accuracy: 0.8075056814691972\n"
+ "Base model accuracy: 0.8074442601805786\n"
]
}
],
@@ -126,9 +138,18 @@
},
{
"cell_type": "code",
- "execution_count": 124,
+ "execution_count": 8,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
+ " self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n"
+ ]
+ }
+ ],
"source": [
"from art.attacks.inference.membership_inference import MembershipInferenceBlackBox\n",
"\n",
@@ -154,14 +175,14 @@
},
{
"cell_type": "code",
- "execution_count": 125,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "0.5440363591696352\n"
+ "0.545264709495148\n"
]
}
],
@@ -197,7 +218,7 @@
},
{
"cell_type": "code",
- "execution_count": 128,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -215,6 +236,7 @@
}
],
"source": [
+ "from apt.utils.datasets import ArrayDataset\n",
"import os\n",
"import sys\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
@@ -223,22 +245,20 @@
"# QI = (age, education-num, capital-gain, hours-per-week)\n",
"QI = [0, 1, 2, 4]\n",
"anonymizer = Anonymize(100, QI)\n",
- "anon = anonymizer.anonymize(x_train, x_train_predictions)\n",
+ "anon = anonymizer.anonymize(ArrayDataset(x_train, x_train_predictions))\n",
"print(anon)"
]
},
{
"cell_type": "code",
- "execution_count": 104,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
- "text/plain": [
- "6739"
- ]
+ "text/plain": "6739"
},
- "execution_count": 104,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -250,16 +270,14 @@
},
{
"cell_type": "code",
- "execution_count": 129,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
- "text/plain": [
- "658"
- ]
+ "text/plain": "658"
},
- "execution_count": 129,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -278,14 +296,14 @@
},
{
"cell_type": "code",
- "execution_count": 130,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Anonymized model accuracy: 0.8304158221239482\n"
+ "Anonymized model accuracy: 0.83078434985566\n"
]
}
],
@@ -308,14 +326,22 @@
},
{
"cell_type": "code",
- "execution_count": 131,
+ "execution_count": 14,
"metadata": {},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
+ " self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n"
+ ]
+ },
{
"name": "stdout",
"output_type": "stream",
"text": [
- "0.5034393809114359\n"
+ "0.5047291487532244\n"
]
}
],
@@ -345,15 +371,15 @@
},
{
"cell_type": "code",
- "execution_count": 132,
+ "execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "(0.5298924372550654, 0.7806166318634075)\n",
- "(0.5030507735890172, 0.5671293452892765)\n"
+ "(0.5312420517168291, 0.7696843139663432)\n",
+ "(0.5048372911169745, 0.4935511607910576)\n"
]
}
],
@@ -419,4 +445,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
-}
+}
\ No newline at end of file
diff --git a/notebooks/membership_inference_dp_diabetes_reg.ipynb b/notebooks/membership_inference_dp_diabetes_reg.ipynb
index 1376dc6..92922ab 100644
--- a/notebooks/membership_inference_dp_diabetes_reg.ipynb
+++ b/notebooks/membership_inference_dp_diabetes_reg.ipynb
@@ -29,7 +29,7 @@
},
{
"cell_type": "code",
- "execution_count": 121,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -50,7 +50,7 @@
},
{
"cell_type": "code",
- "execution_count": 122,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -86,14 +86,14 @@
},
{
"cell_type": "code",
- "execution_count": 123,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "0.4954954954954955\n"
+ "0.527027027027027\n"
]
}
],
@@ -131,7 +131,7 @@
},
{
"cell_type": "code",
- "execution_count": 124,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -141,6 +141,22 @@
"unique rows in original data: 221\n"
]
},
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
+ " self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n",
+ "/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
+ " self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n",
+ "/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
+ " self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n",
+ "/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
+ " self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n",
+ "/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
+ " self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n"
+ ]
+ },
{
"name": "stdout",
"output_type": "stream",
@@ -148,11 +164,12 @@
"k values: [5, 10, 20, 50, 75]\n",
"unique rows: [34, 19, 8, 4, 2]\n",
"model accuracy: [0.43165832354998956, 0.4509641063206041, -1.730181929385853, -5.577098823982753e+27, -1.2751609045828272e+25]\n",
- "attack accuracy: [0.5, 0.47297297297297297, 0.49549549549549543, 0.5, 0.47297297297297297]\n"
+ "attack accuracy: [0.509009009009009, 0.481981981981982, 0.509009009009009, 0.5045045045045045, 0.4954954954954955]\n"
]
}
],
"source": [
+ "from apt.utils.datasets import ArrayDataset\n",
"from apt.anonymization import Anonymize\n",
"k_values=[5, 10, 20, 50, 75]\n",
"model_accuracy = []\n",
@@ -165,7 +182,7 @@
"\n",
"for k in k_values:\n",
" anonymizer = Anonymize(k, QI, is_regression=True)\n",
- " anon = anonymizer.anonymize(X_train, x_train_predictions)\n",
+ " anon = anonymizer.anonymize(ArrayDataset(X_train, x_train_predictions))\n",
" unique_values.append(len(np.unique(anon, axis=0)))\n",
" \n",
" anon_model = LinearRegression()\n",
@@ -198,7 +215,7 @@
},
{
"cell_type": "code",
- "execution_count": 124,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": []
diff --git a/notebooks/minimization_adult.ipynb b/notebooks/minimization_adult.ipynb
index 17610a3..e8ccc20 100644
--- a/notebooks/minimization_adult.ipynb
+++ b/notebooks/minimization_adult.ipynb
@@ -27,7 +27,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 15,
"metadata": {},
"outputs": [
{
@@ -42,6 +42,18 @@
" [2.2000e+01 9.0000e+00 0.0000e+00 0.0000e+00 2.0000e+01]\n",
" [5.2000e+01 9.0000e+00 1.5024e+04 0.0000e+00 4.0000e+01]]\n"
]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_13726/1357868359.py:22: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " y_train = y_train.astype(np.int)\n",
+ "/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_13726/1357868359.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " y_test = y_test.astype(np.int)\n"
+ ]
}
],
"source": [
@@ -84,24 +96,27 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Base model accuracy: 0.8189914624408821\n"
+ "Base model accuracy: 0.8183158282660771\n"
]
}
],
"source": [
+ "from apt.utils.datasets import ArrayDataset\n",
+ "from apt.utils.models import SklearnClassifier, ModelOutputType\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
- "model = DecisionTreeClassifier()\n",
- "model.fit(x_train, y_train)\n",
+ "base_est = DecisionTreeClassifier()\n",
+ "model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)\n",
+ "model.fit(ArrayDataset(x_train, y_train))\n",
"\n",
- "print('Base model accuracy: ', model.score(x_test, y_test))"
+ "print('Base model accuracy: ', model.score(ArrayDataset(x_test, y_test)))"
]
},
{
@@ -114,26 +129,26 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.929376\n",
+ "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.936540\n",
"Improving accuracy\n",
- "feature to remove: 0\n",
- "Removed feature: 0, new relative accuracy: 0.939867\n",
- "feature to remove: 4\n",
- "Removed feature: 4, new relative accuracy: 0.967247\n",
"feature to remove: 2\n",
- "Removed feature: 2, new relative accuracy: 0.972620\n",
+ "Removed feature: 2, new relative accuracy: 0.935261\n",
+ "feature to remove: 4\n",
+ "Removed feature: 4, new relative accuracy: 0.946776\n",
+ "feature to remove: 0\n",
+ "Removed feature: 0, new relative accuracy: 0.972876\n",
"feature to remove: 1\n",
- "Removed feature: 1, new relative accuracy: 0.992323\n",
+ "Removed feature: 1, new relative accuracy: 0.992835\n",
"feature to remove: 3\n",
"Removed feature: 3, new relative accuracy: 1.000000\n",
- "Accuracy on minimized data: 0.8237371411024106\n"
+ "Accuracy on minimized data: 0.8231229847996315\n"
]
}
],
@@ -155,10 +170,12 @@
"X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(x_test, y_test, stratify=y_test,\n",
" test_size = 0.4, random_state = 38)\n",
"x_train_predictions = model.predict(X_generalizer_train)\n",
- "minimizer.fit(X_generalizer_train, x_train_predictions)\n",
- "transformed = minimizer.transform(x_test)\n",
+ "if x_train_predictions.shape[1] > 1:\n",
+ " x_train_predictions = np.argmax(x_train_predictions, axis=1)\n",
+ "minimizer.fit(dataset=ArrayDataset(X_generalizer_train, x_train_predictions))\n",
+ "transformed = minimizer.transform(dataset=ArrayDataset(x_test))\n",
"\n",
- "print('Accuracy on minimized data: ', model.score(transformed, y_test))"
+ "print('Accuracy on minimized data: ', model.score(ArrayDataset(transformed, y_test)))"
]
},
{
@@ -170,14 +187,14 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "{'ranges': {}, 'untouched': [0, 1, 2, 3, 4]}\n"
+ "{'ranges': {}, 'categories': {}, 'untouched': ['4', '1', '3', '0', '2']}\n"
]
}
],
@@ -197,25 +214,25 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.929376\n",
+ "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.936540\n",
"Improving accuracy\n",
- "feature to remove: 0\n",
- "Removed feature: 0, new relative accuracy: 0.939867\n",
- "feature to remove: 4\n",
- "Removed feature: 4, new relative accuracy: 0.967247\n",
"feature to remove: 2\n",
- "Removed feature: 2, new relative accuracy: 0.972620\n",
+ "Removed feature: 2, new relative accuracy: 0.935261\n",
+ "feature to remove: 4\n",
+ "Removed feature: 4, new relative accuracy: 0.946776\n",
+ "feature to remove: 0\n",
+ "Removed feature: 0, new relative accuracy: 0.972876\n",
"feature to remove: 1\n",
- "Removed feature: 1, new relative accuracy: 0.992323\n",
- "Accuracy on minimized data: 0.820205742361431\n",
- "{'ranges': {3: [546.0, 704.0, 705.5, 742.5, 782.0, 834.0, 870.0, 1446.5, 1538.5, 1612.5, 1699.0, 1744.0, 1801.0, 1814.0, 1846.0, 1881.5, 1978.5, 2248.0, 2298.5, 2537.5]}, 'untouched': [0, 1, 2, 4]}\n"
+ "Removed feature: 1, new relative accuracy: 0.992835\n",
+ "Accuracy on minimized data: 0.8192845079072624\n",
+ "{'ranges': {'3': [569.0, 782.0, 870.0, 870.5, 938.0, 1016.5, 1311.5, 1457.0, 1494.5, 1596.0, 1629.5, 1684.0, 1805.0, 1859.0, 1867.5, 1881.5, 1938.0, 1978.5, 2119.0, 2210.0, 2218.0, 2244.5, 2298.5, 2443.5]}, 'categories': {}, 'untouched': ['2', '1', '0', '4']}\n"
]
}
],
@@ -223,9 +240,9 @@
"# We allow a 1% deviation in accuracy from the original model accuracy\n",
"minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.99)\n",
"\n",
- "minimizer2.fit(X_generalizer_train, x_train_predictions)\n",
- "transformed2 = minimizer2.transform(x_test)\n",
- "print('Accuracy on minimized data: ', model.score(transformed2, y_test))\n",
+ "minimizer2.fit(dataset=ArrayDataset(X_generalizer_train, x_train_predictions))\n",
+ "transformed2 = minimizer2.transform(dataset=ArrayDataset(x_test))\n",
+ "print('Accuracy on minimized data: ', model.score(test_data=ArrayDataset(transformed2, y_test)))\n",
"generalizations2 = minimizer2.generalizations\n",
"print(generalizations2)"
]
@@ -259,4 +276,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
-}
+}
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index fa4131d..ec37771 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ numpy==1.21.0
pandas==1.1.0
scipy==1.4.1
scikit-learn==0.22.2
+adversarial-robustness-toolkit>=1.9.1
# testing
pytest==5.4.2
diff --git a/tests/test_anonymizer.py b/tests/test_anonymizer.py
index 000eefa..358398c 100644
--- a/tests/test_anonymizer.py
+++ b/tests/test_anonymizer.py
@@ -7,13 +7,15 @@ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from apt.anonymization import Anonymize
-from apt.utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset
+from apt.utils.dataset_utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
+from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
def test_anonymize_ndarray_iris():
(x_train, y_train), _ = get_iris_dataset()
+
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_train)
@@ -21,7 +23,7 @@ def test_anonymize_ndarray_iris():
k = 10
QI = [0, 2]
anonymizer = Anonymize(k, QI, train_only_QI=True)
- anon = anonymizer.anonymize(x_train, pred)
+ anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
assert(len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0)))
_, counts_elements = np.unique(anon[:, QI], return_counts=True)
assert (np.min(counts_elements) >= k)
@@ -30,10 +32,14 @@ def test_anonymize_ndarray_iris():
def test_anonymize_pandas_adult():
(x_train, y_train), _ = get_adult_dataset()
+ encoded = OneHotEncoder().fit_transform(x_train)
+ model = DecisionTreeClassifier()
+ model.fit(encoded, y_train)
+ pred = model.predict(encoded)
k = 100
- features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation',
- 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
+ features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
+ 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
QI = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'native-country']
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
@@ -56,12 +62,11 @@ def test_anonymize_pandas_adult():
pred = model.predict(encoded)
anonymizer = Anonymize(k, QI, categorical_features=categorical_features)
- anon = anonymizer.anonymize(x_train, pred)
+ anon = anonymizer.anonymize(ArrayDataset(x_train, pred, features))
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
assert (anon.loc[:, QI].value_counts().min() >= k)
- assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
-
+ np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
def test_anonymize_pandas_nursery():
(x_train, y_train), _ = get_nursery_dataset()
@@ -89,11 +94,11 @@ def test_anonymize_pandas_nursery():
pred = model.predict(encoded)
anonymizer = Anonymize(k, QI, categorical_features=categorical_features, train_only_QI=True)
- anon = anonymizer.anonymize(x_train, pred)
+ anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
assert (anon.loc[:, QI].value_counts().min() >= k)
- assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
+ np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
def test_regression():
@@ -107,7 +112,7 @@ def test_regression():
k = 10
QI = [0, 2, 5, 8]
anonymizer = Anonymize(k, QI, is_regression=True, train_only_QI=True)
- anon = anonymizer.anonymize(x_train, pred)
+ anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
model.fit(anon, y_train)
print('Base model accuracy (R2 score) after anonymization: ', model.score(x_test, y_test))
@@ -127,7 +132,7 @@ def test_errors():
anonymizer = Anonymize(10, [0, 2])
(x_train, y_train), (x_test, y_test) = get_iris_dataset()
with pytest.raises(ValueError):
- anonymizer.anonymize(x_train, y_test)
+ anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))
(x_train, y_train), _ = get_adult_dataset()
with pytest.raises(ValueError):
- anonymizer.anonymize(x_train, y_train)
+ anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))
diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py
index e6f50be..630cd49 100644
--- a/tests/test_minimizer.py
+++ b/tests/test_minimizer.py
@@ -5,14 +5,15 @@ from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_boston, load_diabetes
from sklearn.impute import SimpleImputer
-from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from sklearn.preprocessing import OneHotEncoder
from apt.minimization import GeneralizeToRepresentative
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
-from apt.utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset, get_german_credit_dataset
+from apt.utils.dataset_utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset, get_german_credit_dataset
+from apt.utils.datasets import ArrayDataset
+from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegressor
@pytest.fixture
@@ -38,11 +39,12 @@ def test_minimizer_params(data):
y = [1, 1, 0]
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
- base_est.fit(X, y)
+ model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+ model.fit(ArrayDataset(X, y))
- gen = GeneralizeToRepresentative(base_est, features=features, cells=cells)
+ gen = GeneralizeToRepresentative(model, cells=cells)
gen.fit()
- transformed = gen.transform(X)
+ transformed = gen.transform(dataset=ArrayDataset(X, features_names=features))
def test_minimizer_fit(data):
@@ -58,15 +60,20 @@ def test_minimizer_fit(data):
[69, 175],
[24, 181],
[18, 190]])
- y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
+ y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
- base_est.fit(X, y)
- predictions = base_est.predict(X)
+ model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+ model.fit(ArrayDataset(X, y))
+ predictions = model.predict(X)
+ if predictions.shape[1] > 1:
+ predictions = np.argmax(predictions, axis=1)
- gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5)
- gen.fit(X, predictions)
- transformed = gen.transform(X)
+ gen = GeneralizeToRepresentative(model, target_accuracy=0.5)
+ train_dataset = ArrayDataset(X, predictions, features_names=features)
+
+ gen.fit(dataset=train_dataset)
+ transformed = gen.transform(dataset=ArrayDataset(X))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {}, 'categories': {}, 'untouched': ['height', 'age']}
@@ -103,7 +110,7 @@ def test_minimizer_fit_pandas(data):
[69, 175, 'm', 'aa'],
[24, 181, 'm', 'bb'],
[18, 190, 'm', 'bb']]
- y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
+ y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
X = pd.DataFrame(X, columns=features)
numeric_features = ["age", "height"]
@@ -121,16 +128,22 @@ def test_minimizer_fit_pandas(data):
]
)
encoded = preprocessor.fit_transform(X)
+ encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
- base_est.fit(encoded, y)
- predictions = base_est.predict(encoded)
+ model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+ model.fit(ArrayDataset(encoded, y))
+ predictions = model.predict(encoded)
+ if predictions.shape[1] > 1:
+ predictions = np.argmax(predictions, axis=1)
+
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
- gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5,
+ gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
categorical_features=categorical_features)
- gen.fit(X, predictions)
- transformed = gen.transform(X)
+ train_dataset = ArrayDataset(X, predictions)
+ gen.fit(dataset=train_dataset)
+ transformed = gen.transform(dataset=ArrayDataset(X))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': []}, 'categories': {}, 'untouched': ['ola', 'height', 'sex']}
@@ -143,7 +156,7 @@ def test_minimizer_fit_pandas(data):
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()]
- assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
+ np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
@@ -179,7 +192,7 @@ def test_minimizer_params_categorical(data):
[24, 181, 'm'],
[18, 190, 'm']]
- y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
+ y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
X = pd.DataFrame(X, columns=features)
numeric_features = ["age", "height"]
numeric_transformer = Pipeline(
@@ -196,16 +209,21 @@ def test_minimizer_params_categorical(data):
]
)
encoded = preprocessor.fit_transform(X)
+ encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
- base_est.fit(encoded, y)
- predictions = base_est.predict(encoded)
+ model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+ model.fit(ArrayDataset(encoded, y))
+ predictions = model.predict(encoded)
+ if predictions.shape[1] > 1:
+ predictions = np.argmax(predictions, axis=1)
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
- gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5,
+ gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
categorical_features=categorical_features, cells=cells)
- gen.fit(X, predictions)
- transformed = gen.transform(X)
+ train_dataset = ArrayDataset(X, predictions)
+ gen.fit(dataset=train_dataset)
+ transformed = gen.transform(dataset=ArrayDataset(X))
def test_minimizer_fit_QI(data):
@@ -222,16 +240,20 @@ def test_minimizer_fit_QI(data):
[24, 181, 95],
[18, 190, 102]])
print(X)
- y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
- QI = [0, 2]
+ y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+ QI = ['age', 'weight']
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
- base_est.fit(X, y)
- predictions = base_est.predict(X)
+ model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+ model.fit(ArrayDataset(X, y))
+ predictions = model.predict(X)
+ if predictions.shape[1] > 1:
+ predictions = np.argmax(predictions, axis=1)
- gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5, features_to_minimize=QI)
- gen.fit(X, predictions)
- transformed = gen.transform(X)
+ gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
+ train_dataset = ArrayDataset(X, predictions, features_names=features)
+ gen.fit(dataset=train_dataset)
+ transformed = gen.transform(dataset=ArrayDataset(X))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
for key in expexted_generalizations['ranges']:
@@ -240,7 +262,7 @@ def test_minimizer_fit_QI(data):
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
- assert ((np.delete(transformed, QI, axis=1) == np.delete(X, QI, axis=1)).all())
+ assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()]
@@ -269,7 +291,7 @@ def test_minimizer_fit_pandas_QI(data):
[24, 181, 49, 'm', 'bb'],
[18, 190, 69, 'm', 'bb']]
- y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
+ y = pd.Series([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
X = pd.DataFrame(X, columns=features)
QI = ['age', 'weight', 'ola']
@@ -288,16 +310,22 @@ def test_minimizer_fit_pandas_QI(data):
]
)
encoded = preprocessor.fit_transform(X)
+ encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
- base_est.fit(encoded, y)
- predictions = base_est.predict(encoded)
+ model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+ model.fit(ArrayDataset(encoded, y))
+ predictions = model.predict(encoded)
+ if predictions.shape[1] > 1:
+ predictions = np.argmax(predictions, axis=1)
+
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
- gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5,
+ gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
categorical_features=categorical_features, features_to_minimize=QI)
- gen.fit(X, predictions)
- transformed = gen.transform(X)
+ train_dataset = ArrayDataset(X, predictions)
+ gen.fit(dataset=train_dataset)
+ transformed = gen.transform(dataset=ArrayDataset(X))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
'untouched': ['height', 'sex']}
@@ -308,12 +336,13 @@ def test_minimizer_fit_pandas_QI(data):
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
- assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1)))
-
+ # assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1)))
+ np.testing.assert_array_equal(transformed.drop(QI, axis=1), X.drop(QI, axis=1))
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()]
- assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
+ # assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
+ np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
@@ -322,16 +351,19 @@ def test_minimizer_fit_pandas_QI(data):
def test_minimize_ndarray_iris():
features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
- (x_train, y_train), _ = get_iris_dataset()
- QI = [0, 2]
- model = DecisionTreeClassifier(random_state=0, min_samples_split=2,
- min_samples_leaf=1)
- model.fit(x_train, y_train)
- pred = model.predict(x_train)
+ (x_train, y_train), (x_test, y_test) = get_iris_dataset()
+ QI = ['sepal length (cm)', 'petal length (cm)']
+ base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
+ min_samples_leaf=1)
+ model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+ model.fit(ArrayDataset(x_train, y_train))
+ predictions = model.predict(x_train)
+ if predictions.shape[1] > 1:
+ predictions = np.argmax(predictions, axis=1)
- gen = GeneralizeToRepresentative(model, target_accuracy=0.3, features=features, features_to_minimize=QI)
- gen.fit(x_train, pred)
- transformed = gen.transform(x_train)
+ gen = GeneralizeToRepresentative(model, target_accuracy=0.3, features_to_minimize=QI)
+ # gen.fit(dataset=ArrayDataset(x_train, predictions))
+ transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'sepal length (cm)': [], 'petal length (cm)': [2.449999988079071]},
'categories': {}, 'untouched': ['petal width (cm)', 'sepal width (cm)']}
@@ -342,7 +374,7 @@ def test_minimize_ndarray_iris():
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
- assert ((np.delete(transformed, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
+ assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(x_train, [0, 2], axis=1)).all())
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
@@ -359,12 +391,13 @@ def test_minimize_ndarray_iris():
def test_minimize_pandas_adult():
- (x_train, y_train), _ = get_adult_dataset()
+ (x_train, y_train), (x_test, y_test) = get_adult_dataset()
x_train = x_train.head(1000)
y_train = y_train.head(1000)
features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
+ x_train = pd.DataFrame(x_train, columns=features)
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'hours-per-week', 'native-country']
@@ -384,15 +417,19 @@ def test_minimize_pandas_adult():
]
)
encoded = preprocessor.fit_transform(x_train)
+ encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
- base_est.fit(encoded, y_train)
- predictions = base_est.predict(encoded)
+ model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+ model.fit(ArrayDataset(encoded, y_train))
+ predictions = model.predict(encoded)
+ if predictions.shape[1] > 1:
+ predictions = np.argmax(predictions, axis=1)
- gen = GeneralizeToRepresentative(base_est, target_accuracy=0.7, features=features,
+ gen = GeneralizeToRepresentative(model, target_accuracy=0.7,
categorical_features=categorical_features, features_to_minimize=QI)
- gen.fit(x_train, predictions)
- transformed = gen.transform(x_train)
+ gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
+ transformed = gen.transform(dataset=ArrayDataset(x_train))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': [], 'education-num': []}, 'categories': {
'workclass': [['Self-emp-not-inc', 'Private', 'Federal-gov', 'Self-emp-inc', '?', 'Local-gov', 'State-gov']],
@@ -414,12 +451,14 @@ def test_minimize_pandas_adult():
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
- assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
+ # assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
+ np.testing.assert_array_equal(transformed.drop(QI, axis=1), x_train.drop(QI, axis=1))
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()]
- assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
+ # assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
+ np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
@@ -451,15 +490,19 @@ def test_german_credit_pandas():
]
)
encoded = preprocessor.fit_transform(x_train)
+ encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
- base_est.fit(encoded, y_train)
- predictions = base_est.predict(encoded)
+ model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+ model.fit(ArrayDataset(encoded, y_train))
+ predictions = model.predict(encoded)
+ if predictions.shape[1] > 1:
+ predictions = np.argmax(predictions, axis=1)
- gen = GeneralizeToRepresentative(base_est, target_accuracy=0.7, features=features,
+ gen = GeneralizeToRepresentative(model, target_accuracy=0.7,
categorical_features=categorical_features, features_to_minimize=QI)
- gen.fit(x_train, predictions)
- transformed = gen.transform(x_train)
+ gen.fit(dataset=ArrayDataset(x_train, predictions))
+ transformed = gen.transform(dataset=ArrayDataset(x_train))
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'Duration_in_month': [31.5]},
'categories': {'Credit_history': [['A30', 'A32', 'A31', 'A34', 'A33']], 'Purpose': [
@@ -481,12 +524,14 @@ def test_german_credit_pandas():
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
- assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
+ # assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
+ np.testing.assert_array_equal(transformed.drop(QI, axis=1), x_train.drop(QI, axis=1))
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()]
- assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
+ # assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
+ np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
@@ -497,17 +542,258 @@ def test_regression():
dataset = load_diabetes()
x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)
- model = DecisionTreeRegressor(random_state=10, min_samples_split=2)
- model.fit(x_train, y_train)
- pred = model.predict(x_train)
- QI = [0, 2, 5, 8]
+ base_est = DecisionTreeRegressor(random_state=10, min_samples_split=2)
+ model = SklearnRegressor(base_est)
+ model.fit(ArrayDataset(x_train, y_train))
+ predictions = model.predict(x_train)
+ QI = ['age', 'bmi', 's2', 's5']
features = ['age', 'sex', 'bmi', 'bp',
's1', 's2', 's3', 's4', 's5', 's6']
- gen = GeneralizeToRepresentative(model, target_accuracy=0.7, features=features, is_regression=True,
+ gen = GeneralizeToRepresentative(model, target_accuracy=0.7, is_regression=True,
features_to_minimize=QI)
- gen.fit(x_train, pred)
- transformed = gen.transform(x_train)
+ gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
+ transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features))
+ print('Base model accuracy (R2 score): ', model.score(ArrayDataset(x_test, y_test)))
+ model.fit(ArrayDataset(transformed, y_train))
+ print('Base model accuracy (R2 score) after anonymization: ', model.score(ArrayDataset(x_test, y_test)))
+ gener = gen.generalizations_
+ expexted_generalizations = {'ranges': {
+ 'age': [-0.07816532626748085, -0.07090024650096893, -0.05637009255588055, -0.05092128552496433,
+ -0.04728874587453902, -0.04547247663140297, -0.04183994047343731, -0.027309784665703773,
+ -0.023677248042076826, -0.020044708624482155, -0.01641217083670199, -0.001882016600575298,
+ 0.0017505218856967986, 0.0035667913616634905, 0.007199329789727926, 0.010831868276000023,
+ 0.02354575227946043, 0.030810829252004623, 0.03262709779664874, 0.03444336913526058,
+ 0.03625963814556599, 0.03807590529322624, 0.03807590715587139, 0.047157252207398415,
+ 0.06168740428984165, 0.0635036751627922, 0.06895248219370842, 0.07258502021431923, 0.07621755823493004,
+ 0.1034616008400917],
+ 'bmi': [-0.07626373693346977, -0.060635464265942574, -0.056863121688365936, -0.05578530766069889,
+ -0.054168591275811195, -0.042312657460570335, -0.0374625027179718, -0.03422906715422869,
+ -0.033690162003040314, -0.03261234890669584, -0.02614547684788704, -0.025067666545510292,
+ -0.022373135201632977, -0.016984074376523495, -0.01375063881278038, -0.007822672137990594,
+ -0.004589236050378531, 0.008344509289599955, 0.015889193629845977, 0.016967005096375942,
+ 0.024511689320206642, 0.0272062208969146, 0.030978563241660595, 0.032595280557870865,
+ 0.033673093654215336, 0.04391230642795563, 0.04552902653813362, 0.05469042807817459,
+ 0.06977979838848114, 0.07301323488354683, 0.09349166229367256],
+ 's2': [-0.1044962927699089, -0.08649025857448578, -0.07740895450115204, -0.07114598527550697,
+ -0.06378699466586113, -0.05971606448292732, -0.04437179118394852, -0.0398311372846365,
+ -0.03137612994760275, -0.022138250060379505, -0.018067320343106985, -0.017910746857523918,
+ -0.017910745926201344, -0.01618842873722315, -0.007576846517622471, -0.007263698382303119,
+ -0.0010007291566580534, 0.0010347360512241721, 0.006514834007248282, 0.00933317095041275,
+ 0.012464655097573996, 0.019197346206055954, 0.020919663831591606, 0.02217225730419159,
+ 0.032036433927714825, 0.036420512944459915, 0.04080459102988243, 0.04127431474626064,
+ 0.04268348217010498, 0.04424922354519367, 0.04424922540783882, 0.056462014093995094, 0.05928034894168377,
+ 0.061315815430134535, 0.06272498145699501, 0.06460387445986271]}, 'categories': {},
+ 'untouched': ['s5', 's3', 'bp', 's1', 'sex', 's6', 's4']}
+
+ for key in expexted_generalizations['ranges']:
+ assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+ for key in expexted_generalizations['categories']:
+ assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
+ set([frozenset(sl) for sl in gener['categories'][key]]))
+ assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
+ assert ((np.delete(transformed, [0, 2, 5, 8], axis=1) == np.delete(x_train, [0, 2, 5, 8], axis=1)).all())
+
+ modified_features = [f for f in features if
+ f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
+ 'ranges'].keys()]
+ indexes = []
+ for i in range(len(features)):
+ if features[i] in modified_features:
+ indexes.append(i)
+ assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
+ ncp = gen.ncp_
+ if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
+ assert (ncp > 0)
+ assert (((transformed[indexes]) != (x_train[indexes])).any())
+
+
+def test_X_y(data):
+ features = [0, 1, 2]
+ X = np.array([[23, 165, 70],
+ [45, 158, 67],
+ [56, 123, 65],
+ [67, 154, 90],
+ [45, 149, 67],
+ [42, 166, 58],
+ [73, 172, 68],
+ [94, 168, 69],
+ [69, 175, 80],
+ [24, 181, 95],
+ [18, 190, 102]])
+ print(X)
+ y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+ QI = [0, 2]
+ base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
+ min_samples_leaf=1)
+ model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+ model.fit(ArrayDataset(X, y))
+ predictions = model.predict(X)
+ if predictions.shape[1] > 1:
+ predictions = np.argmax(predictions, axis=1)
+
+ gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
+ gen.fit(X=X, y=predictions)
+ transformed = gen.transform(X)
+ gener = gen.generalizations_
+    expected_generalizations = {'ranges': {'0': [], '2': [67.5]}, 'categories': {}, 'untouched': ['1']}
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
+                set([frozenset(sl) for sl in gener['categories'][key]]))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
+    assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
+    modified_features = [f for f in features if
+                         str(f) in expected_generalizations['categories'].keys() or str(f) in expected_generalizations[
+                             'ranges'].keys()]
+    indexes = []
+    for i in range(len(features)):
+        if features[i] in modified_features:
+            indexes.append(i)
+    assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
+    ncp = gen.ncp_
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
+ assert (ncp > 0)
+ assert (((transformed[indexes]) != (X[indexes])).any())
+
+
+def test_X_y_features_names(data):
+ features = ['age', 'height', 'weight']
+ X = np.array([[23, 165, 70],
+ [45, 158, 67],
+ [56, 123, 65],
+ [67, 154, 90],
+ [45, 149, 67],
+ [42, 166, 58],
+ [73, 172, 68],
+ [94, 168, 69],
+ [69, 175, 80],
+ [24, 181, 95],
+ [18, 190, 102]])
+ print(X)
+ y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+ QI = ['age', 'weight']
+ base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
+ min_samples_leaf=1)
+ model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+ model.fit(ArrayDataset(X, y))
+ predictions = model.predict(X)
+ if predictions.shape[1] > 1:
+ predictions = np.argmax(predictions, axis=1)
+
+ gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
+ gen.fit(X=X, y=predictions, features_names=features)
+ transformed = gen.transform(X=X, features_names=features)
+ gener = gen.generalizations_
+    expected_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
+                set([frozenset(sl) for sl in gener['categories'][key]]))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
+    assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
+    modified_features = [f for f in features if
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
+                             'ranges'].keys()]
+    indexes = []
+    for i in range(len(features)):
+        if features[i] in modified_features:
+            indexes.append(i)
+    assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
+    ncp = gen.ncp_
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
+ assert (ncp > 0)
+ assert (((transformed[indexes]) != (X[indexes])).any())
+
+
+def test_BaseEstimator_classification(data):
+ features = ['age', 'height', 'weight', 'sex', 'ola']
+ X = [[23, 165, 65, 'f', 'aa'],
+ [45, 158, 76, 'f', 'aa'],
+ [56, 123, 78, 'f', 'bb'],
+ [67, 154, 87, 'm', 'aa'],
+ [45, 149, 45, 'f', 'bb'],
+ [42, 166, 76, 'm', 'bb'],
+ [73, 172, 85, 'm', 'bb'],
+ [94, 168, 92, 'f', 'aa'],
+ [69, 175, 95, 'm', 'aa'],
+ [24, 181, 49, 'm', 'bb'],
+ [18, 190, 69, 'm', 'bb']]
+
+ y = pd.Series([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+ X = pd.DataFrame(X, columns=features)
+ QI = ['age', 'weight', 'ola']
+
+ numeric_features = ["age", "height", "weight"]
+ numeric_transformer = Pipeline(
+ steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
+ )
+
+ categorical_features = ["sex", "ola"]
+ categorical_transformer = OneHotEncoder(handle_unknown="ignore")
+
+ preprocessor = ColumnTransformer(
+ transformers=[
+ ("num", numeric_transformer, numeric_features),
+ ("cat", categorical_transformer, categorical_features),
+ ]
+ )
+ encoded = preprocessor.fit_transform(X)
+ encoded = pd.DataFrame(encoded)
+ base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
+ min_samples_leaf=1)
+ model = base_est
+ model.fit(encoded, y)
+ predictions = model.predict(encoded)
+
+ # Append classifier to preprocessing pipeline.
+ # Now we have a full prediction pipeline.
+ gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
+ categorical_features=categorical_features, features_to_minimize=QI)
+ train_dataset = ArrayDataset(X, predictions)
+ gen.fit(dataset=train_dataset)
+ transformed = gen.transform(dataset=ArrayDataset(X))
+ gener = gen.generalizations_
+    expected_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
+                                'untouched': ['height', 'sex']}
+
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
+                set([frozenset(sl) for sl in gener['categories'][key]]))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
+    # assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1)))
+    np.testing.assert_array_equal(transformed.drop(QI, axis=1), X.drop(QI, axis=1))
+    modified_features = [f for f in features if
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
+                             'ranges'].keys()]
+    # assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
+    np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
+    ncp = gen.ncp_
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
+ assert (ncp > 0)
+ assert (((transformed[modified_features]).equals(X[modified_features])) == False)
+
+
+def test_BaseEstimator_regression():
+ dataset = load_diabetes()
+ x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)
+
+ base_est = DecisionTreeRegressor(random_state=10, min_samples_split=2)
+ model = base_est
+ model.fit(x_train, y_train)
+ predictions = model.predict(x_train)
+ QI = ['age', 'bmi', 's2', 's5']
+ features = ['age', 'sex', 'bmi', 'bp',
+ 's1', 's2', 's3', 's4', 's5', 's6']
+
+ gen = GeneralizeToRepresentative(model, target_accuracy=0.7, is_regression=True,
+ features_to_minimize=QI)
+ gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
+ transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features))
print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
model.fit(transformed, y_train)
print('Base model accuracy (R2 score) after minimization: ', model.score(x_test, y_test))
@@ -546,7 +832,7 @@ def test_regression():
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
- assert ((np.delete(transformed, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
+ assert ((np.delete(transformed, [0, 2, 5, 8], axis=1) == np.delete(x_train, [0, 2, 5, 8], axis=1)).all())
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
diff --git a/tests/test_model.py b/tests/test_model.py
new file mode 100644
index 0000000..bbb951b
--- /dev/null
+++ b/tests/test_model.py
@@ -0,0 +1,35 @@
+import pytest
+
+from apt.utils.models import SklearnClassifier, SklearnRegressor, ModelOutputType
+from apt.utils.datasets import ArrayDataset
+from apt.utils import dataset_utils
+
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.ensemble import RandomForestClassifier
+
+
+def test_sklearn_classifier():
+ (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset()
+ underlying_model = RandomForestClassifier()
+ model = SklearnClassifier(underlying_model, ModelOutputType.CLASSIFIER_VECTOR)
+ train = ArrayDataset(x_train, y_train)
+ test = ArrayDataset(x_test, y_test)
+ model.fit(train)
+ pred = model.predict(x_test)
+ assert(pred.shape[0] == x_test.shape[0])
+
+ score = model.score(test)
+ assert(0.0 <= score <= 1.0)
+
+
+def test_sklearn_regressor():
+ (x_train, y_train), (x_test, y_test) = dataset_utils.get_diabetes_dataset()
+ underlying_model = DecisionTreeRegressor()
+ model = SklearnRegressor(underlying_model)
+ train = ArrayDataset(x_train, y_train)
+ test = ArrayDataset(x_test, y_test)
+ model.fit(train)
+ pred = model.predict(x_test)
+ assert (pred.shape[0] == x_test.shape[0])
+
+ score = model.score(test)