diff --git a/apt/anonymization/anonymizer.py b/apt/anonymization/anonymizer.py index 9f82c7c..02854f5 100644 --- a/apt/anonymization/anonymizer.py +++ b/apt/anonymization/anonymizer.py @@ -8,6 +8,7 @@ from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.preprocessing import OneHotEncoder +from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE from typing import Union, Optional @@ -49,61 +50,64 @@ class Anonymize: self.categorical_features = categorical_features self.is_regression = is_regression self.train_only_QI = train_only_QI + self.features_names = None + self.features = None - def anonymize(self, x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame]) \ - -> Union[np.ndarray, pd.DataFrame]: + def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE: """ Method for performing model-guided anonymization. - :param x: The training data for the model. If provided as a pandas dataframe, may contain both numeric and - categorical data. - :param y: The predictions of the original model on the training data. + :param dataset: Data wrapper containing the training data for the model and the predictions of the + original model on the training data. :return: An array containing the anonymized training dataset. 
""" - if type(x) == np.ndarray: - self.features = [i for i in range(x.shape[1])] - return self._anonymize_ndarray(x.copy(), y) - else: # pandas - self.features = x.columns - if not self.categorical_features: - raise ValueError('When supplying a pandas dataframe, categorical_features must be defined') - return self._anonymize_pandas(x.copy(), y) + if dataset.get_samples().shape[1] != 0: + self.features = [i for i in range(dataset.get_samples().shape[1])] + else: + raise ValueError('No data provided') - def _anonymize_ndarray(self, x, y): + if dataset.features_names is not None: + self.features_names = dataset.features_names + else: # if no names provided, use numbers instead + self.features_names = self.features + + if not set(self.quasi_identifiers).issubset(set(self.features_names)): + raise ValueError('Quasi identifiers should bs a subset of the supplied features or indexes in range of ' + 'the data columns') + if self.categorical_features and not set(self.categorical_features).issubset(set(self.features_names)): + raise ValueError('Categorical features should bs a subset of the supplied features or indexes in range of ' + 'the data columns') + self.quasi_identifiers = [i for i, v in enumerate(self.features_names) if v in self.quasi_identifiers] + if self.categorical_features: + self.categorical_features = [i for i, v in enumerate(self.features_names) if v in self.categorical_features] + + transformed = self._anonymize(dataset.get_samples().copy(), dataset.get_labels()) + if dataset.is_pandas: + return pd.DataFrame(transformed, columns=self.features_names) + else: + return transformed + + def _anonymize(self, x, y): if x.shape[0] != y.shape[0]: raise ValueError("x and y should have same number of rows") - x_anonymizer_train = x - if self.train_only_QI: - # build DT just on QI features - x_anonymizer_train = x[:, self.quasi_identifiers] if x.dtype.kind not in 'iufc': - x_prepared = self._modify_categorical_features(x_anonymizer_train) + if not 
self.categorical_features: + raise ValueError('when supplying an array with non-numeric data, categorical_features must be defined') + x_prepared = self._modify_categorical_features(x) else: - x_prepared = x_anonymizer_train - if self.is_regression: - self.anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k) - else: - self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k) - self.anonymizer.fit(x_prepared, y) - cells_by_id = self._calculate_cells(x, x_prepared) - return self._anonymize_data_numpy(x, x_prepared, cells_by_id) - - def _anonymize_pandas(self, x, y): - if x.shape[0] != y.shape[0]: - raise ValueError("x and y should have same number of rows") - x_anonymizer_train = x + x_prepared = x + x_anonymizer_train = x_prepared if self.train_only_QI: # build DT just on QI features - x_anonymizer_train = x.loc[:, self.quasi_identifiers] - # need to one-hot encode before training the decision tree - x_prepared = self._modify_categorical_features(x_anonymizer_train) + x_anonymizer_train = x_prepared[:, self.quasi_identifiers] if self.is_regression: self.anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k) else: self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k) - self.anonymizer.fit(x_prepared, y) - cells_by_id = self._calculate_cells(x, x_prepared) - return self._anonymize_data_pandas(x, x_prepared, cells_by_id) + + self.anonymizer.fit(x_anonymizer_train, y) + cells_by_id = self._calculate_cells(x, x_anonymizer_train) + return self._anonymize_data(x, x_anonymizer_train, cells_by_id) def _calculate_cells(self, x, x_anonymizer_train): # x is original data, x_anonymizer_train is only QIs + 1-hot encoded @@ -130,15 +134,9 @@ class Anonymize: # get all rows in cell indexes = [index for index, node_id in enumerate(node_ids) if node_id == cell['id']] # TODO: should we filter only those 
with majority label? (using hist) - if type(x) == np.ndarray: - rows = x[indexes] - else: # pandas - rows = x.iloc[indexes] + rows = x[indexes] for feature in self.quasi_identifiers: - if type(x) == np.ndarray: - values = rows[:, feature] - else: # pandas - values = rows.loc[:, feature] + values = rows[:, feature] if self.categorical_features and feature in self.categorical_features: # find most common value cell['representative'][feature] = Counter(values).most_common(1)[0][0] @@ -163,7 +161,7 @@ class Anonymize: node_ids = self._find_sample_nodes(samples) return [cells_by_id[node_id] for node_id in node_ids] - def _anonymize_data_numpy(self, x, x_anonymizer_train, cells_by_id): + def _anonymize_data(self, x, x_anonymizer_train, cells_by_id): cells = self._find_sample_cells(x_anonymizer_train, cells_by_id) index = 0 for row in x: @@ -173,22 +171,12 @@ class Anonymize: row[feature] = cell['representative'][feature] return x - def _anonymize_data_pandas(self, x, x_anonymizer_train, cells_by_id): - cells = self._find_sample_cells(x_anonymizer_train, cells_by_id) - index = 0 - for i, row in x.iterrows(): - cell = cells[index] - index += 1 - for feature in cell['representative']: - x.at[i, feature] = cell['representative'][feature] - return x - def _modify_categorical_features(self, x): # prepare data for DT used_features = self.features if self.train_only_QI: used_features = self.quasi_identifiers - numeric_features = [f for f in x.columns if f in used_features and f not in self.categorical_features] + numeric_features = [f for f in self.features if f in used_features and f not in self.categorical_features] categorical_features = [f for f in self.categorical_features if f in used_features] numeric_transformer = Pipeline( steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))] diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py index d04cc03..27b6b6e 100644 --- a/apt/minimization/minimizer.py +++ b/apt/minimization/minimizer.py @@ 
-1,7 +1,7 @@ """ This module implements all classes needed to perform data minimization """ -from typing import Union +from typing import Union, Optional import pandas as pd import numpy as np import copy @@ -16,6 +16,9 @@ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.model_selection import train_test_split +from apt.utils.datasets import ArrayDataset, Data, DATA_PANDAS_NUMPY_TYPE +from apt.utils.models import Model, SklearnRegressor, ModelOutputType, SklearnClassifier + class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin): """ A transformer that generalizes data to representative points. @@ -24,16 +27,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM and a target accuracy. Once the generalizations are learned, can receive one or more data records and transform them to representative points based on the learned generalization. - - An alternative way to use the transformer is to supply ``cells`` and - ``features`` in init or set_params and those will be used to transform + An alternative way to use the transformer is to supply ``cells`` in + init or set_params and those will be used to transform data to representatives. In this case, fit must still be called but there is no need to supply it with ``X`` and ``y``, and there is no need to supply an existing ``estimator`` to init. - In summary, either ``estimator`` and ``target_accuracy`` should be - supplied or ``cells`` and ``features`` should be supplied. - + supplied or ``cells`` should be supplied. Parameters ---------- estimator : estimator, optional @@ -43,8 +43,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM The required accuracy when applying the base model to the generalized data. Accuracy is measured relative to the original accuracy of the model. 
- features : list of str, optional - The feature names, in the order that they appear in the data. categorical_features: list of str, optional The list of categorical features should only be supplied when passing data as a pandas dataframe. @@ -67,28 +65,29 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM Attributes ---------- + features_ : list of str + The feature names, in the order that they appear in the data. cells_ : list of object The cells used to generalize records, as learned when calling fit. - ncp_ : float The NCP (information loss) score of the resulting generalization, as measured on the training data. - generalizations_ : object The generalizations that were learned (actual feature ranges). - - Notes - ----- - - """ - def __init__(self, estimator=None, target_accuracy=0.998, features=None, - cells=None, categorical_features=None, features_to_minimize: Union[np.ndarray, list] = None - , train_only_QI=True, is_regression=False): - self.estimator = estimator + def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: float = 0.998, + cells: list = None, categorical_features: Union[np.ndarray, list] = None, + features_to_minimize: Union[np.ndarray, list] = None, train_only_QI: bool = True, + is_regression: bool = False): + if issubclass(estimator.__class__, Model): + self.estimator = estimator + else: + if is_regression: + self.estimator = SklearnRegressor(estimator) + else: + self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_VECTOR) self.target_accuracy = target_accuracy - self.features = features self.cells = cells self.categorical_features = [] if categorical_features: @@ -114,11 +113,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM ret = {} ret['target_accuracy'] = self.target_accuracy if deep: - ret['features'] = copy.deepcopy(self.features) ret['cells'] = copy.deepcopy(self.cells) ret['estimator'] = self.estimator else: - 
ret['features'] = copy.copy(self.features) ret['cells'] = copy.copy(self.cells) return ret @@ -132,8 +129,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM """ if 'target_accuracy' in params: self.target_accuracy = params['target_accuracy'] - if 'features' in params: - self.features = params['features'] if 'cells' in params: self.cells = params['cells'] return self @@ -142,7 +137,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM def generalizations(self): return self.generalizations_ - def fit_transform(self, X: Union[np.ndarray, pd.DataFrame] = None, y: Union[np.ndarray, pd.DataFrame] = None): + def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None, + features_names: Optional = None, dataset: Optional[ArrayDataset] = None): """Learns the generalizations based on training data, and applies them to the data. Parameters @@ -152,17 +148,22 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM y : array-like, shape (n_samples,), optional The target values. An array of int. This should contain the predictions of the original model on ``X``. - + features_names : list of str, The feature names, in the order that they appear in the data, + provided just if X and y were provided (optional). + dataset : Data wrapper containing the training input samples and the predictions of the + original model on the training data. + Either X,y OR dataset need to be provided, not both. Returns ------- X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features) The array containing the representative values to which each record in ``X`` is mapped. 
""" - self.fit(X, y) - return self.transform(X) + self.fit(X, y, features_names, dataset=dataset) + return self.transform(X, features_names, dataset=dataset) - def fit(self, X: Union[np.ndarray, pd.DataFrame] = None, y: Union[np.ndarray, pd.DataFrame] = None): + def fit(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None, + features_names: Optional = None, dataset: ArrayDataset = None): """Learns the generalizations based on training data. Parameters @@ -172,7 +173,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM y : array-like, shape (n_samples,), optional The target values. An array of int. This should contain the predictions of the original model on ``X``. - + features_names : list of str, The feature names, in the order that they appear in the data, + provided just if X and y were provided (optional). + dataset : Data wrapper containing the training input samples and the predictions of the + original model on the training data. + Either X,y OR dataset need to be provided, not both. 
Returns ------- X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features) @@ -181,26 +186,25 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM """ # take into account that estimator, X, y, cells, features may be None - if X is not None: - if type(X) == np.ndarray: - self.is_numpy = True - else: - self.is_numpy = False - if X is not None and y is not None: - if self.is_numpy: - X, y = check_X_y(X, y, accept_sparse=True) - self.n_features_ = X.shape[1] - elif self.features: - self.n_features_ = len(self.features) + if dataset is not None: + raise ValueError('Either X,y OR dataset need to be provided, not both') + else: + dataset = ArrayDataset(X, y, features_names) + + if dataset and dataset.get_samples() is not None and dataset.get_labels() is not None: + self.n_features_ = dataset.get_samples().shape[1] + + elif dataset and dataset.features_names: + self.n_features_ = len(dataset.features_names) else: self.n_features_ = 0 - if self.features: - self._features = self.features + if dataset and dataset.features_names: + self._features = dataset.features_names # if features is None, use numbers instead of names elif self.n_features_ != 0: - self._features = [i for i in range(self.n_features_)] + self._features = [str(i) for i in range(self.n_features_)] else: self._features = None @@ -212,27 +216,24 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM # Going to fit # (currently not dealing with option to fit with only X and y and no estimator) - if self.estimator and X is not None and y is not None: + if self.estimator and dataset and dataset.get_samples() is not None and dataset.get_labels() is not None: + x = pd.DataFrame(dataset.get_samples(), columns=self._features) + if not self.features_to_minimize: + self.features_to_minimize = self._features + self.features_to_minimize = [str(i) for i in self.features_to_minimize] + if not all(elem in self._features for elem in 
self.features_to_minimize): + raise ValueError('features to minimize should be a subset of features names') + x_QI = x.loc[:, self.features_to_minimize] - if self.is_numpy: - if not self.features_to_minimize: - self.features_to_minimize = [i for i in range(len(self._features))] - x_QI = X[:, self.features_to_minimize] - self.features_to_minimize = [self._features[i] for i in self.features_to_minimize] - X = pd.DataFrame(X, columns=self._features) - else: - if not self.features_to_minimize: - self.features_to_minimize = self._features - x_QI = X.loc[:, self.features_to_minimize] - x_QI = pd.DataFrame(x_QI, columns=self.features_to_minimize) # divide dataset into train and test - used_data = X + used_data = x if self.train_only_QI: used_data = x_QI if self.is_regression: - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=14) + X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4, random_state=14) else: - X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.4, random_state=18) + X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), stratify=dataset.get_labels(), test_size=0.4, + random_state=18) X_train_QI = X_train.loc[:, self.features_to_minimize] X_test_QI = X_test.loc[:, self.features_to_minimize] @@ -246,7 +247,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM for feature in self._features: if feature not in feature_data.keys(): fd = {} - values = list(X.loc[:, feature]) + values = list(x.loc[:, feature]) if feature not in self.categorical_features: fd['min'] = min(values) fd['max'] = max(values) @@ -259,7 +260,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM categorical_features = [f for f in self._features if f in self.categorical_features and f in self.features_to_minimize] - numeric_transformer = Pipeline( steps=[('imputer', SimpleImputer(strategy='constant', 
fill_value=0))] ) @@ -288,7 +288,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM ("cat", categorical_transformer, self.categorical_features), ] ) - preprocessor.fit(X) + preprocessor.fit(x) x_prepared = preprocessor.transform(X_train) if self.train_only_QI: x_prepared = preprocessor_QI_features.transform(X_train_QI) @@ -300,7 +300,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self.dt_ = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=1) else: self.dt_ = DecisionTreeClassifier(random_state=0, min_samples_split=2, - min_samples_leaf=1) + min_samples_leaf=1) self.dt_.fit(x_prepared, y_train) self._modify_categorical_features(used_data) @@ -329,7 +329,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_) # check accuracy - accuracy = self.estimator.score(preprocessor.transform(generalized), y_test) + accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test)) print('Initial accuracy of model on generalized data, relative to original model predictions ' '(base generalization derived from tree, before improvements): %f' % accuracy) @@ -349,7 +349,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self._calculate_generalizations() generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_) - accuracy = self.estimator.score(preprocessor.transform(generalized), y_test) + accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test)) # if accuracy passed threshold roll back to previous iteration generalizations if accuracy < self.target_accuracy: self.cells_ = cells_previous_iter @@ -375,7 +375,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self._calculate_generalizations() 
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_) - accuracy = self.estimator.score(preprocessor.transform(generalized), y_test) + accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test)) print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy)) # self.cells_ currently holds the chosen generalization based on target accuracy @@ -386,7 +386,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM # Return the transformer return self - def transform(self, X: Union[np.ndarray, pd.DataFrame]): + def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional = None, dataset: ArrayDataset = None): """ Transforms data records to representative points. Parameters @@ -394,6 +394,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM X : {array-like, sparse-matrix}, shape (n_samples, n_features), If provided as a pandas dataframe, may contain both numeric and categorical data. The input samples. + features_names : list of str, The feature names, in the order that they appear in the data, + provided just if X was provided (optional). + dataset : Data wrapper containing the training input samples. + Either X OR dataset need to be provided, not both. Returns ------- X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features) @@ -405,26 +409,30 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM msg = 'This %(name)s instance is not initialized yet. ' \ 'Call ‘fit’ or ‘set_params’ with ' \ 'appropriate arguments before using this method.' 
- check_is_fitted(self, ['cells', 'features'], msg=msg) + check_is_fitted(self, ['cells'], msg=msg) - if type(X) == np.ndarray: - # Input validation - X = check_array(X, accept_sparse=True) - self.is_numpy = True - X = pd.DataFrame(X, columns=self._features) - else: - self.is_numpy = False + if X is not None: + if dataset is not None: + raise ValueError('Either X OR dataset need to be provided, not both') + else: + dataset = ArrayDataset(X, features_names=features_names) + elif dataset is None: + raise ValueError('Either X OR dataset need to be provided, not both') + if dataset and dataset.features_names: + self._features = dataset.features_names + if dataset and dataset.get_samples() is not None: + x = pd.DataFrame(dataset.get_samples(), columns=self._features) - if X.shape[1] != self.n_features_ and self.n_features_ != 0: + if x.shape[1] != self.n_features_ and self.n_features_ != 0: raise ValueError('Shape of input is different from what was seen' 'in `fit`') if not self._features: - self._features = [i for i in range(X.shape[1])] + self._features = [i for i in range(x.shape[1])] representatives = pd.DataFrame(columns=self._features) # only columns - generalized = pd.DataFrame(X, columns=self._features, copy=True) # original data - mapped = np.zeros(X.shape[0]) # to mark records we already mapped + generalized = pd.DataFrame(x, columns=self._features, copy=True) # original data + mapped = np.zeros(x.shape[0]) # to mark records we already mapped # iterate over cells (leaves in decision tree) for i in range(len(self.cells_)): @@ -443,7 +451,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM representatives = representatives.drop(feature, axis=1) # get the indexes of all records that map to this cell - indexes = self._get_record_indexes_for_cell(X, self.cells_[i], mapped) + indexes = self._get_record_indexes_for_cell(x, self.cells_[i], mapped) # replace the values in the representative columns with the representative # values 
(leaves others untouched) @@ -454,9 +462,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM replace = representatives.loc[i].to_frame().T.reset_index(drop=True) replace.index = indexes generalized.loc[indexes, representatives.columns] = replace - if self.is_numpy: - return generalized.to_numpy() - return generalized + if dataset and dataset.is_pandas: + return generalized + elif isinstance(X, pd.DataFrame): + return generalized + return generalized.to_numpy() def _get_record_indexes_for_cell(self, X, cell, mapped): indexes = [] @@ -640,7 +650,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM # else: nothing to do, stay with previous cells def _calculate_level_cell_label(self, left_cell, right_cell, new_cell): - new_cell['hist'] = [x + y for x, y in zip(left_cell['hist'], right_cell['hist'])] if not self.is_regression else [] + new_cell['hist'] = [x + y for x, y in + zip(left_cell['hist'], right_cell['hist'])] if not self.is_regression else [] new_cell['label'] = int(self.dt_.classes_[np.argmax(new_cell['hist'])]) if not self.is_regression else 1 def _get_nodes_level(self, level): @@ -797,8 +808,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM cells_by_id = copy.deepcopy(self.cells_by_id_) GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature) generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id) - accuracy_gain = self.estimator.score(self._preprocessor.transform(generalized), - labels) - current_accuracy + accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized), + labels)) - current_accuracy if accuracy_gain < 0: accuracy_gain = 0 if accuracy_gain != 0: @@ -820,8 +831,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM cells_by_id = copy.deepcopy(self.cells_by_id_) 
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature) generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id) - accuracy_gain = self.estimator.score(self._preprocessor.transform(generalized), - labels) - current_accuracy + accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized), + labels)) - current_accuracy if accuracy_gain < 0: accuracy_gain = 0 diff --git a/apt/utils/__init__.py b/apt/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apt/utils.py b/apt/utils/dataset_utils.py similarity index 93% rename from apt/utils.py rename to apt/utils/dataset_utils.py index bc73cbc..e3eb959 100644 --- a/apt/utils.py +++ b/apt/utils/dataset_utils.py @@ -13,8 +13,7 @@ def _load_iris(test_set_size: float = 0.3): # Split training and test sets x_train, x_test, y_train, y_test = model_selection.train_test_split(data, labels, test_size=test_set_size, - random_state=18, stratify=labels, - shuffle=True) + random_state=18, stratify=labels) return (x_train, y_train), (x_test, y_test) @@ -29,6 +28,28 @@ def get_iris_dataset(test_set: float = 0.3): return _load_iris(test_set) +def _load_diabetes(test_set_size: float = 0.3): + diabetes = datasets.load_diabetes() + data = diabetes.data + labels = diabetes.target + + # Split training and test sets + x_train, x_test, y_train, y_test = model_selection.train_test_split(data, labels, test_size=test_set_size, + random_state=18) + + return (x_train, y_train), (x_test, y_test) + + +def get_diabetes_dataset(): + """ + Loads the Diabetes (regression) dataset from scikit-learn. + + :return: Training and test data as a tuple ((x_train, y_train), (x_test, y_test)), + each element a numpy array, split 70%/30% with a fixed random seed. + """ + return _load_diabetes() + + def get_german_credit_dataset(test_set: float = 0.3): """ Loads the UCI German_credit dataset from `tests/datasets/german` or downloads it if necessary.
@@ -253,7 +274,7 @@ def get_nursery_dataset(raw: bool = True, test_set: float = 0.2, transform_socia raise Exception("Bad label value: %s" % value) data["label"] = data["label"].apply(modify_label) - data["children"] = data["children"].apply(lambda x: 4 if x == "more" else x) + data["children"] = data["children"].apply(lambda x: "4" if x == "more" else x) if transform_social: diff --git a/apt/utils/datasets/__init__.py b/apt/utils/datasets/__init__.py new file mode 100644 index 0000000..6e7c640 --- /dev/null +++ b/apt/utils/datasets/__init__.py @@ -0,0 +1,7 @@ +""" +The AI Privacy Toolbox (datasets). +Implementation of datasets utility components for datasets creation, load, and store +""" + +from apt.utils.datasets.datasets import Dataset, StoredDataset, DatasetFactory, Data, ArrayDataset, \ + OUTPUT_DATA_ARRAY_TYPE, DATA_PANDAS_NUMPY_TYPE diff --git a/apt/utils/datasets/datasets.py b/apt/utils/datasets/datasets.py new file mode 100644 index 0000000..ff7c296 --- /dev/null +++ b/apt/utils/datasets/datasets.py @@ -0,0 +1,320 @@ +# !/usr/bin/env python +""" +The AI Privacy Toolbox (datasets). 
+Implementation of utility classes for dataset handling +""" + +from abc import ABCMeta, abstractmethod +from typing import Callable, Collection, Any, Union, List, Optional + +import tarfile +import os +import urllib.request +import numpy as np +import pandas as pd +import logging +import torch +from torch import Tensor + +logger = logging.getLogger(__name__) + + +INPUT_DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame, List, Tensor] +OUTPUT_DATA_ARRAY_TYPE = np.ndarray +DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame] + + +def array2numpy(self, arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE: + + """ + converts from INPUT_DATA_ARRAY_TYPE to numpy array + """ + if type(arr) == np.ndarray: + return arr + if type(arr) == pd.DataFrame or type(arr) == pd.Series: + self.is_pandas = True + return arr.to_numpy() + if isinstance(arr, list): + return np.array(arr) + if type(arr) == Tensor: + return arr.detach().cpu().numpy() + + raise ValueError('Non supported type: ', type(arr).__name__) + + +def array2torch_tensor(self, arr: INPUT_DATA_ARRAY_TYPE) -> Tensor: + """ + converts from INPUT_DATA_ARRAY_TYPE to torch tensor array + """ + if type(arr) == np.ndarray: + return torch.from_numpy(arr) + if type(arr) == pd.DataFrame or type(arr) == pd.Series: + self.is_pandas = True + return torch.from_numpy(arr.to_numpy()) + if isinstance(arr, list): + return torch.tensor(arr) + if type(arr) == Tensor: + return arr + + raise ValueError('Non supported type: ', type(arr).__name__) + + +class Dataset(metaclass=ABCMeta): + """Base Abstract Class for Dataset""" + + @abstractmethod + def __init__(self, **kwargs): + pass + + @abstractmethod + def get_samples(self) -> Collection[Any]: + """Return data samples""" + pass + + @abstractmethod + def get_labels(self) -> Collection[Any]: + """Return labels""" + pass + + +class StoredDataset(Dataset): + """Abstract Class for Storable Dataset""" + + @abstractmethod + def load_from_file(self, path: str): + """Load dataset from file""" + 
pass + + @abstractmethod + def load(self, **kwargs): + """Load dataset""" + pass + + @staticmethod + def download(url: str, dest_path: str, filename: str, unzip: bool = False) -> None: + """ + Download the dataset from URL + :param url: dataset URL, the dataset will be requested from this URL + :param dest_path: local dataset destination path + :param filename: local dataset filename + :param unzip: flag whether or not perform extraction + :return: None + """ + file_path = os.path.join(dest_path, filename) + + if os.path.exists(file_path): + logger.warning("Files already downloaded, skipping downloading") + + else: + os.makedirs(dest_path, exist_ok=True) + logger.info("Downloading the dataset...") + urllib.request.urlretrieve(url, file_path) + logger.info('Dataset Downloaded') + + if unzip: + StoredDataset.extract_archive(zip_path=file_path, dest_path=dest_path, remove_archive=False) + + @staticmethod + def extract_archive(zip_path: str, dest_path=None, remove_archive=False): + """ + Extract dataset from archived file + :param zip_path: path to archived file + :param dest_path: directory path to uncompress the file to + :param remove_archive: whether remove the archive file after uncompress (default False) + :return: None + """ + logger.info("Extracting the dataset...") + tar = tarfile.open(zip_path) + tar.extractall(path=dest_path) + + logger.info("Dataset was extracted to {}".format(dest_path)) + if remove_archive: + logger.info("Removing a zip file") + os.remove(zip_path) + logger.info("Extracted the dataset") + + @staticmethod + def split_debug(datafile: str, dest_datafile: str, ratio: int, shuffle=True, delimiter=",", fmt=None) -> None: + """ + Split the data and take only a part of it + :param datafile: dataset file path + :param dest_datafile: destination path for the partial dataset file + :param ratio: part of the dataset to save + :param shuffle: whether to shuffle the data or not (default True) + :param delimiter: dataset delimiter (default ",") + :param 
fmt: format for the correct data saving + :return: None + """ + if os.path.isfile(dest_datafile): + logger.info(f"The partial debug split already exists {dest_datafile}") + return + else: + os.makedirs(os.path.dirname(dest_datafile), exist_ok=True) + + data = np.genfromtxt(datafile, delimiter=delimiter) + if shuffle: + logger.info("Shuffling data") + np.random.shuffle(data) + + debug_data = data[:int(len(data) * ratio)] + logger.info(f"Saving {ratio} of the data to {dest_datafile}") + np.savetxt(dest_datafile, debug_data, delimiter=delimiter, fmt=fmt) + + +class ArrayDataset(Dataset): + """Dataset that is based on x and y arrays (e.g., numpy/pandas/list...)""" + + def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None, + features_names: Optional = None, **kwargs): + """ + ArrayDataset constructor. + :param x: collection of data samples + :param y: collection of labels (optional) + :param features_names: list of str, The feature names, in the order that they appear in the data (optional) + :param kwargs: dataset parameters + """ + self.is_pandas = False + self.features_names = features_names + self._y = array2numpy(self, y) if y is not None else None + self._x = array2numpy(self, x) + if self.is_pandas: + if features_names and not np.array_equal(features_names, x.columns): + raise ValueError("The supplied features are not the same as in the data features") + self.features_names = x.columns.to_list() + + if y is not None and len(self._x) != len(self._y): + raise ValueError('Non equivalent lengths of x and y') + + def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE: + """Return data samples as numpy array""" + return self._x + + def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE: + """Return labels as numpy array""" + return self._y + + +class PytorchData(Dataset): + + def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None, **kwargs): + """ + PytorchData constructor. 
+ :param x: collection of data samples + :param y: collection of labels (optional) + :param kwargs: dataset parameters + """ + self.is_pandas = False + self._y = array2torch_tensor(self, y) if y is not None else None + self._x = array2torch_tensor(self, x) + if self.is_pandas: + self.features_names = x.columns + + if y is not None and len(self._x) != len(self._y): + raise ValueError('Non equivalent lengths of x and y') + + + if self._y is not None: + self.__getitem__ = self.get_item + else: + self.__getitem__ = self.get_sample_item + + + def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE: + """Return data samples as numpy array""" + return array2numpy(self._x) + + def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE: + """Return labels as numpy array""" + return array2numpy(self._y) if self._y is not None else None + + def get_sample_item(self, idx) -> Tensor: + return self.x[idx] + + def get_item(self, idx) -> Tensor: + sample, label = self.x[idx], self.y[idx] + return sample, label + + def __len__(self): + return len(self.x) + + +class DatasetFactory: + """Factory class for dataset creation""" + registry = {} + + @classmethod + def register(cls, name: str) -> Callable: + """ + Class method to register Dataset to the internal registry + :param name: dataset name + :return: + """ + + def inner_wrapper(wrapped_class: Dataset) -> Any: + if name in cls.registry: + logger.warning('Dataset %s already exists. Will replace it', name) + cls.registry[name] = wrapped_class + return wrapped_class + + return inner_wrapper + + @classmethod + def create_dataset(cls, name: str, **kwargs) -> Dataset: + """ + Factory command to create dataset instance. + This method gets the appropriate Dataset class from the registry + and creates an instance of it, while passing in the parameters + given in ``kwargs``. + :param name: The name of the dataset to create. + :param kwargs: dataset parameters + :return: An instance of the dataset that is created. 
+ """ + if name not in cls.registry: + msg = f'Dataset {name} does not exist in the registry' + logger.error(msg) + raise ValueError(msg) + + exec_class = cls.registry[name] + executor = exec_class(**kwargs) + return executor + + +class Data: + def __init__(self, train: Dataset = None, test: Dataset = None, **kwargs): + """ + Data class constructor. + The class stores train and test datasets. + If neither of the datasets was provided, + Both train and test datasets will be create using + DatasetFactory to create a dataset instance + """ + if train or test: + self.train = train + self.test = test + else: + self.train = DatasetFactory.create_dataset(train=True, **kwargs) + self.test = DatasetFactory.create_dataset(train=False, **kwargs) + + def get_train_set(self) -> Dataset: + """Return train DatasetBase""" + return self.train + + def get_test_set(self) -> Dataset: + """Return test DatasetBase""" + return self.test + + def get_train_samples(self) -> Collection[Any]: + """Return train set samples""" + return self.train.get_samples() + + def get_train_labels(self) -> Collection[Any]: + """Return train set labels""" + return self.train.get_labels() + + def get_test_samples(self) -> Collection[Any]: + """Return test set samples""" + return self.test.get_samples() + + def get_test_labels(self) -> Collection[Any]: + """Return test set labels""" + return self.test.get_labels() diff --git a/apt/utils/models/__init__.py b/apt/utils/models/__init__.py new file mode 100644 index 0000000..11efd5f --- /dev/null +++ b/apt/utils/models/__init__.py @@ -0,0 +1,2 @@ +from apt.utils.models.model import Model, ModelOutputType +from apt.utils.models.sklearn_model import SklearnModel, SklearnClassifier, SklearnRegressor diff --git a/apt/utils/models/model.py b/apt/utils/models/model.py new file mode 100644 index 0000000..9616459 --- /dev/null +++ b/apt/utils/models/model.py @@ -0,0 +1,109 @@ +from abc import ABCMeta, abstractmethod +from typing import Any, Optional +from enum import 
Enum, auto + +from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE + + +class ModelOutputType(Enum): + CLASSIFIER_VECTOR = auto() # probabilities or logits + CLASSIFIER_SCALAR = auto() # label only + REGRESSOR_SCALAR = auto() # value + + +class Model(metaclass=ABCMeta): + """ + Abstract base class for ML model wrappers. + """ + + def __init__(self, model: Any, output_type: ModelOutputType, black_box_access: Optional[bool] = True, + unlimited_queries: Optional[bool] = True, **kwargs): + """ + Initialize a `Model` wrapper object. + + :param model: The original model object (of the underlying ML framework) + :param output_type: The type of output the model yields (vector/label only for classifiers, + value for regressors) + :param black_box_access: Boolean describing the type of deployment of the model (when in production). + Set to True if the model is only available via query (API) access, i.e., + only the outputs of the model are exposed, and False if the model internals + are also available. Optional, Default is True. + :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform + unlimited queries to the model API or whether there is a limit to the number of + queries that can be submitted. Optional, Default is True. + """ + self._model = model + self._output_type = output_type + self._black_box_access = black_box_access + self._unlimited_queries = unlimited_queries + + @abstractmethod + def fit(self, train_data: Dataset, **kwargs) -> None: + """ + Fit the model using the training data. + + :param train_data: Training data. + :type train_data: `Dataset` + """ + raise NotImplementedError + + @abstractmethod + def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE: + """ + Perform predictions using the model for input `x`. + + :param x: Input samples. + :type x: `np.ndarray` or `pandas.DataFrame` + :return: Predictions from the model. 
+ """ + raise NotImplementedError + + @abstractmethod + def score(self, test_data: Dataset, **kwargs): + """ + Score the model using test data. + + :param test_data: Test data. + :type train_data: `Dataset` + """ + return NotImplementedError + + @property + def model(self) -> Any: + """ + Return the model. + + :return: The model. + """ + return self._model + + @property + def output_type(self) -> ModelOutputType: + """ + Return the model's output type. + + :return: The model's output type. + """ + return self._output_type + + @property + def black_box_access(self) -> bool: + """ + Return True if the model is only available via query (API) access, i.e., + only the outputs of the model are exposed, and False if the model internals are also available. + + :return: True if the model is only available via query (API) access, i.e., + only the outputs of the model are exposed, and False if the model internals are also available. + """ + return self._black_box_access + + @property + def unlimited_queries(self) -> bool: + """ + If black_box_access is True, Return whether a user can perform unlimited queries to the model API + or whether there is a limit to the number of queries that can be submitted. + + :return: If black_box_access is True, Return whether a user can perform unlimited queries to the model API + or whether there is a limit to the number of queries that can be submitted. 
+ """ + return self._unlimited_queries diff --git a/apt/utils/models/sklearn_model.py b/apt/utils/models/sklearn_model.py new file mode 100644 index 0000000..f7afaa6 --- /dev/null +++ b/apt/utils/models/sklearn_model.py @@ -0,0 +1,112 @@ +from typing import Optional + +import numpy as np + +from sklearn.preprocessing import OneHotEncoder +from sklearn.base import BaseEstimator + +from apt.utils.models import Model, ModelOutputType +from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE + +from art.estimators.classification.scikitlearn import SklearnClassifier as ArtSklearnClassifier +from art.estimators.regression.scikitlearn import ScikitlearnRegressor + + +class SklearnModel(Model): + """ + Wrapper class for scikitlearn models. + """ + def score(self, test_data: Dataset, **kwargs): + """ + Score the model using test data. + + :param test_data: Test data. + :type train_data: `Dataset` + """ + return self.model.score(test_data.get_samples(), test_data.get_labels(), **kwargs) + + +class SklearnClassifier(SklearnModel): + """ + Wrapper class for scikitlearn classification models. + """ + def __init__(self, model: BaseEstimator, output_type: ModelOutputType, black_box_access: Optional[bool] = True, + unlimited_queries: Optional[bool] = True, **kwargs): + """ + Initialize a `SklearnClassifier` wrapper object. + + :param model: The original sklearn model object. + :param output_type: The type of output the model yields (vector/label only for classifiers, + value for regressors) + :param black_box_access: Boolean describing the type of deployment of the model (when in production). + Set to True if the model is only available via query (API) access, i.e., + only the outputs of the model are exposed, and False if the model internals + are also available. Optional, Default is True. 
+ :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform + unlimited queries to the model API or whether there is a limit to the number of + queries that can be submitted. Optional, Default is True. + """ + super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs) + self._art_model = ArtSklearnClassifier(model) + + def fit(self, train_data: Dataset, **kwargs) -> None: + """ + Fit the model using the training data. + + :param train_data: Training data. + :type train_data: `Dataset` + """ + encoder = OneHotEncoder(sparse=False) + y_encoded = encoder.fit_transform(train_data.get_labels().reshape(-1, 1)) + self._art_model.fit(train_data.get_samples(), y_encoded, **kwargs) + + def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE: + """ + Perform predictions using the model for input `x`. + + :param x: Input samples. + :type x: `np.ndarray` or `pandas.DataFrame` + :return: Predictions from the model (class probabilities, if supported). + """ + return self._art_model.predict(x, **kwargs) + + +class SklearnRegressor(SklearnModel): + """ + Wrapper class for scikitlearn regression models. + """ + def __init__(self, model: BaseEstimator, black_box_access: Optional[bool] = True, + unlimited_queries: Optional[bool] = True, **kwargs): + """ + Initialize a `SklearnRegressor` wrapper object. + + :param model: The original sklearn model object. + :param black_box_access: Boolean describing the type of deployment of the model (when in production). + Set to True if the model is only available via query (API) access, i.e., + only the outputs of the model are exposed, and False if the model internals + are also available. Optional, Default is True. + :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform + unlimited queries to the model API or whether there is a limit to the number of + queries that can be submitted. Optional, Default is True. 
+ """ + super().__init__(model, ModelOutputType.REGRESSOR_SCALAR, black_box_access, unlimited_queries, **kwargs) + self._art_model = ScikitlearnRegressor(model) + + def fit(self, train_data: Dataset, **kwargs) -> None: + """ + Fit the model using the training data. + + :param train_data: Training data. + :type train_data: `Dataset` + """ + self._art_model.fit(train_data.get_samples(), train_data.get_labels(), **kwargs) + + def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE: + """ + Perform predictions using the model for input `x`. + + :param x: Input samples. + :type x: `np.ndarray` or `pandas.DataFrame` + :return: Predictions from the model. + """ + return self._art_model.predict(x, **kwargs) diff --git a/notebooks/attribute_inference_anonymization_nursery.ipynb b/notebooks/attribute_inference_anonymization_nursery.ipynb index 9952885..bfba540 100644 --- a/notebooks/attribute_inference_anonymization_nursery.ipynb +++ b/notebooks/attribute_inference_anonymization_nursery.ipynb @@ -29,198 +29,15 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 1, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
parentshas_nursformchildrenhousingfinancesocialhealth
8450pretentiousvery_critfoster1less_convconvenient1not_recom
12147great_pretvery_critcomplete1criticalinconv1recommended
2780usualcriticalcomplete4less_convconvenient1not_recom
11924great_pretcriticalfoster1criticalconvenient1not_recom
59usualpropercomplete2convenientconvenient0not_recom
...........................
5193pretentiousless_propercomplete1convenientinconv0recommended
1375usualless_properincomplete2less_convconvenient1priority
10318great_pretless_properfoster4convenientconvenient0priority
6396pretentiousimpropercompleted3less_convconvenient1recommended
485usualproperincomplete1criticalinconv1not_recom
\n", - "

10366 rows × 8 columns

\n", - "
" - ], - "text/plain": [ - " parents has_nurs form children housing finance \\\n", - "8450 pretentious very_crit foster 1 less_conv convenient \n", - "12147 great_pret very_crit complete 1 critical inconv \n", - "2780 usual critical complete 4 less_conv convenient \n", - "11924 great_pret critical foster 1 critical convenient \n", - "59 usual proper complete 2 convenient convenient \n", - "... ... ... ... ... ... ... \n", - "5193 pretentious less_proper complete 1 convenient inconv \n", - "1375 usual less_proper incomplete 2 less_conv convenient \n", - "10318 great_pret less_proper foster 4 convenient convenient \n", - "6396 pretentious improper completed 3 less_conv convenient \n", - "485 usual proper incomplete 1 critical inconv \n", - "\n", - " social health \n", - "8450 1 not_recom \n", - "12147 1 recommended \n", - "2780 1 not_recom \n", - "11924 1 not_recom \n", - "59 0 not_recom \n", - "... ... ... \n", - "5193 0 recommended \n", - "1375 1 priority \n", - "10318 0 priority \n", - "6396 1 recommended \n", - "485 1 not_recom \n", - "\n", - "[10366 rows x 8 columns]" - ] + "text/plain": " parents has_nurs form children housing finance \\\n8450 pretentious very_crit foster 1 less_conv convenient \n12147 great_pret very_crit complete 1 critical inconv \n2780 usual critical complete 4 less_conv convenient \n11924 great_pret critical foster 1 critical convenient \n59 usual proper complete 2 convenient convenient \n... ... ... ... ... ... ... \n5193 pretentious less_proper complete 1 convenient inconv \n1375 usual less_proper incomplete 2 less_conv convenient \n10318 great_pret less_proper foster 4 convenient convenient \n6396 pretentious improper completed 3 less_conv convenient \n485 usual proper incomplete 1 critical inconv \n\n social health \n8450 1 not_recom \n12147 1 recommended \n2780 1 not_recom \n11924 1 not_recom \n59 0 not_recom \n... ... ... 
\n5193 0 recommended \n1375 1 priority \n10318 0 priority \n6396 1 recommended \n485 1 not_recom \n\n[10366 rows x 8 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
parentshas_nursformchildrenhousingfinancesocialhealth
8450pretentiousvery_critfoster1less_convconvenient1not_recom
12147great_pretvery_critcomplete1criticalinconv1recommended
2780usualcriticalcomplete4less_convconvenient1not_recom
11924great_pretcriticalfoster1criticalconvenient1not_recom
59usualpropercomplete2convenientconvenient0not_recom
...........................
5193pretentiousless_propercomplete1convenientinconv0recommended
1375usualless_properincomplete2less_convconvenient1priority
10318great_pretless_properfoster4convenientconvenient0priority
6396pretentiousimpropercompleted3less_convconvenient1recommended
485usualproperincomplete1criticalinconv1not_recom
\n

10366 rows × 8 columns

\n
" }, - "execution_count": 61, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -230,7 +47,7 @@ "import sys\n", "sys.path.insert(0, os.path.abspath('..'))\n", "\n", - "from apt.utils import get_nursery_dataset\n", + "from apt.utils.dataset_utils import get_nursery_dataset\n", "\n", "(x_train, y_train), (x_test, y_test) = get_nursery_dataset(transform_social=True)\n", "\n", @@ -246,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -263,9 +80,9 @@ "from sklearn.preprocessing import OneHotEncoder\n", "\n", "x_train_str = x_train.astype(str)\n", - "train_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(x_train_str)\n", + "train_encoded = OneHotEncoder(sparse=False).fit_transform(x_train_str)\n", "x_test_str = x_test.astype(str)\n", - "test_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(x_test_str)\n", + "test_encoded = OneHotEncoder(sparse=False).fit_transform(x_test_str)\n", " \n", "model = DecisionTreeClassifier()\n", "model.fit(train_encoded, y_train)\n", @@ -287,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -323,14 +140,14 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.6430638626278217\n" + "1.0\n" ] } ], @@ -361,14 +178,14 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.6980513216284006\n" + "0.5122515917422342\n" ] } ], @@ -408,224 +225,43 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
parentshas_nursformchildrenhousingfinancesocialhealth
8450pretentiousvery_critfoster1less_convconvenient0not_recom
12147great_pretvery_critcomplete1criticalinconv1recommended
2780usualcriticalcomplete4less_convconvenient0not_recom
11924great_pretcriticalfoster1criticalconvenient0not_recom
59usualpropercomplete2convenientconvenient0not_recom
...........................
5193pretentiousless_propercomplete1convenientinconv0recommended
1375usualless_properincomplete2less_convconvenient1priority
10318great_pretless_properfoster4convenientconvenient0priority
6396pretentiousimpropercompleted3less_convconvenient1recommended
485usualproperincomplete1criticalconvenient0not_recom
\n", - "

10366 rows × 8 columns

\n", - "
" - ], - "text/plain": [ - " parents has_nurs form children housing finance \\\n", - "8450 pretentious very_crit foster 1 less_conv convenient \n", - "12147 great_pret very_crit complete 1 critical inconv \n", - "2780 usual critical complete 4 less_conv convenient \n", - "11924 great_pret critical foster 1 critical convenient \n", - "59 usual proper complete 2 convenient convenient \n", - "... ... ... ... ... ... ... \n", - "5193 pretentious less_proper complete 1 convenient inconv \n", - "1375 usual less_proper incomplete 2 less_conv convenient \n", - "10318 great_pret less_proper foster 4 convenient convenient \n", - "6396 pretentious improper completed 3 less_conv convenient \n", - "485 usual proper incomplete 1 critical convenient \n", - "\n", - " social health \n", - "8450 0 not_recom \n", - "12147 1 recommended \n", - "2780 0 not_recom \n", - "11924 0 not_recom \n", - "59 0 not_recom \n", - "... ... ... \n", - "5193 0 recommended \n", - "1375 1 priority \n", - "10318 0 priority \n", - "6396 1 recommended \n", - "485 0 not_recom \n", - "\n", - "[10366 rows x 8 columns]" - ] + "text/plain": " parents has_nurs form children housing finance \\\n0 pretentious very_crit foster 1 less_conv convenient \n1 great_pret very_crit complete 1 critical inconv \n2 usual critical complete 4 less_conv convenient \n3 great_pret critical foster 1 critical convenient \n4 usual proper complete 2 convenient convenient \n... ... ... ... ... ... ... \n10361 pretentious less_proper complete 1 convenient inconv \n10362 usual less_proper incomplete 2 less_conv convenient \n10363 great_pret less_proper foster 4 convenient convenient \n10364 pretentious improper completed 3 less_conv convenient \n10365 usual proper incomplete 1 critical convenient \n\n social health \n0 0 not_recom \n1 1 recommended \n2 0 not_recom \n3 0 not_recom \n4 0 not_recom \n... ... ... 
\n10361 0 recommended \n10362 1 priority \n10363 0 priority \n10364 1 recommended \n10365 0 not_recom \n\n[10366 rows x 8 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
parentshas_nursformchildrenhousingfinancesocialhealth
0pretentiousvery_critfoster1less_convconvenient0not_recom
1great_pretvery_critcomplete1criticalinconv1recommended
2usualcriticalcomplete4less_convconvenient0not_recom
3great_pretcriticalfoster1criticalconvenient0not_recom
4usualpropercomplete2convenientconvenient0not_recom
...........................
10361pretentiousless_propercomplete1convenientinconv0recommended
10362usualless_properincomplete2less_convconvenient1priority
10363great_pretless_properfoster4convenientconvenient0priority
10364pretentiousimpropercompleted3less_convconvenient1recommended
10365usualproperincomplete1criticalconvenient0not_recom
\n

10366 rows × 8 columns

\n
" }, - "execution_count": 97, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "from apt.utils.datasets import ArrayDataset\n", "from apt.anonymization import Anonymize\n", "\n", + "features = x_train.columns\n", "QI = [\"finance\", \"social\", \"health\"]\n", "categorical_features = [\"parents\", \"has_nurs\", \"form\", \"housing\", \"finance\", \"health\", 'children']\n", - "anonymizer = Anonymize(100, QI, categorical_features=categorical_features)\n", - "anon = anonymizer.anonymize(x_train, x_train_predictions)\n", - "anon" + "QI_indexes = [i for i, v in enumerate(features) if v in QI]\n", + "categorical_features_indexes = [i for i, v in enumerate(features) if v in categorical_features]\n", + "anonymizer = Anonymize(100, QI_indexes, categorical_features=categorical_features_indexes)\n", + "anon = anonymizer.anonymize(ArrayDataset(x_train, x_train_predictions))\n", + "anon\n" ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { - "text/plain": [ - "7585" - ] + "text/plain": "7585" }, - "execution_count": 64, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -637,16 +273,14 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { - "text/plain": [ - "5766" - ] + "text/plain": "5766" }, - "execution_count": 65, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -665,7 +299,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -678,7 +312,7 @@ ], "source": [ "anon_str = anon.astype(str)\n", - "anon_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(anon_str)\n", + "anon_encoded = OneHotEncoder(sparse=False).fit_transform(anon_str)\n", "\n", "anon_model = DecisionTreeClassifier()\n", "anon_model.fit(anon_encoded, y_train)\n", @@ -698,14 +332,14 @@ }, { "cell_type": 
"code", - "execution_count": 98, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.6471155701331275\n" + "1.0\n" ] } ], @@ -734,14 +368,14 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.6982442600810341\n" + "0.5245996527107852\n" ] } ], @@ -765,15 +399,15 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(0.33056202194878614, 0.2888695146759663)\n", - "(0.34112301200908796, 0.3054344667247893)\n" + "(0.49415432579890883, 0.48976438779451525)\n", + "(0.49415432579890883, 0.48976438779451525)\n" ] } ], @@ -810,15 +444,15 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(0.6457357075913777, 0.2002324905550712)\n", - "(0.6472248353715898, 0.1999418773612322)\n" + "(1.0, 0.019204655674102813)\n", + "(0.9829787234042553, 0.04481086323957323)\n" ] } ], @@ -849,26 +483,24 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ - "anonymizer2 = Anonymize(1000, QI, categorical_features=categorical_features)\n", - "anon2 = anonymizer2.anonymize(x_train, x_train_predictions)" + "anonymizer2 = Anonymize(1000, QI_indexes, categorical_features=categorical_features_indexes)\n", + "anon2 = anonymizer2.anonymize(ArrayDataset(x_train, x_train_predictions))" ] }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { - "text/plain": [ - "4226" - ] + "text/plain": "4226" }, - "execution_count": 75, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -887,7 +519,7 @@ }, { "cell_type": "code", - "execution_count": 104, + 
"execution_count": 16, "metadata": {}, "outputs": [ { @@ -900,7 +532,7 @@ ], "source": [ "anon2_str = anon2.astype(str)\n", - "anon2_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(anon2_str)\n", + "anon2_encoded = OneHotEncoder(sparse=False).fit_transform(anon2_str)\n", "\n", "anon2_model = DecisionTreeClassifier()\n", "anon2_model.fit(anon2_encoded, y_train)\n", @@ -920,14 +552,14 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.6266640941539648\n" + "1.0\n" ] } ], @@ -956,14 +588,14 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.6944819602546788\n" + "0.515820953115956\n" ] } ], @@ -980,17 +612,17 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(0.35793357933579334, 0.17037470725995316)\n", - "(0.3360655737704918, 0.1680327868852459)\n", - "(0.6457357075913777, 0.2002324905550712)\n", - "(0.6327519379844961, 0.1897704155768672)\n" + "(0.49415432579890883, 0.48976438779451525)\n", + "(0.49415432579890883, 0.48976438779451525)\n", + "(1.0, 0.019204655674102813)\n", + "(1.0, 0.026382153249272552)\n" ] } ], @@ -1023,27 +655,26 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "QI2 = [\"parents\", \"has_nurs\", \"form\", \"children\", \"housing\", \"finance\", \"social\", \"health\"]\n", - "anonymizer3 = Anonymize(100, QI2, categorical_features=categorical_features)\n", - "anon3 = anonymizer3.anonymize(x_train, x_train_predictions)" + "QI2_indexes = [i for i, v in enumerate(features) if v in QI2]\n", + "anonymizer3 = Anonymize(100, QI2_indexes, categorical_features=categorical_features_indexes)\n", + "anon3 = 
anonymizer3.anonymize(ArrayDataset(x_train, x_train_predictions))" ] }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { - "text/plain": [ - "39" - ] + "text/plain": "39" }, - "execution_count": 112, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1055,22 +686,22 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Anonymized model accuracy: 0.7723765432098766\n", - "BB attack accuracy: 0.5792012348060969\n", - "WB attack accuracy: 0.6680493922438742\n" + "Anonymized model accuracy: 0.751929012345679\n", + "BB attack accuracy: 1.0\n", + "WB attack accuracy: 0.5187150299054601\n" ] } ], "source": [ "anon3_str = anon3.astype(str)\n", - "anon3_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(anon3_str)\n", + "anon3_encoded = OneHotEncoder(sparse=False).fit_transform(anon3_str)\n", "\n", "anon3_model = DecisionTreeClassifier()\n", "anon3_model.fit(anon3_encoded, y_train)\n", @@ -1105,17 +736,17 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(0.35793357933579334, 0.17037470725995316)\n", - "(0.3393939393939394, 0.13114754098360656)\n", - "(0.6457357075913777, 0.2002324905550712)\n", - "(1, 0.0)\n" + "(0.49415432579890883, 0.48976438779451525)\n", + "(0.49415432579890883, 0.48976438779451525)\n", + "(1.0, 0.019204655674102813)\n", + "(1.0, 0.032201745877788554)\n" ] } ], @@ -1162,4 +793,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/notebooks/membership_inference_anonymization_adult.ipynb b/notebooks/membership_inference_anonymization_adult.ipynb index c2c7e74..4a0ea00 100644 --- a/notebooks/membership_inference_anonymization_adult.ipynb +++ 
b/notebooks/membership_inference_anonymization_adult.ipynb @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -44,6 +44,18 @@ " [ 26. 11. 0. 0. 48.]\n", " [ 27. 9. 0. 0. 40.]]\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_85828/3975777015.py:22: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " y_train = y_train.astype(np.int)\n", + "/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_85828/3975777015.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. 
If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " y_test = y_test.astype(np.int)\n" + ] } ], "source": [ @@ -90,14 +102,14 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Base model accuracy: 0.8075056814691972\n" + "Base model accuracy: 0.8074442601805786\n" ] } ], @@ -126,9 +138,18 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples,), for example using ravel().\n", + " self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n" + ] + } + ], "source": [ "from art.attacks.inference.membership_inference import MembershipInferenceBlackBox\n", "\n", @@ -154,14 +175,14 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.5440363591696352\n" + "0.545264709495148\n" ] } ], @@ -197,7 +218,7 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -215,6 +236,7 @@ } ], "source": [ + "from apt.utils.datasets import ArrayDataset\n", "import os\n", "import sys\n", "sys.path.insert(0, os.path.abspath('..'))\n", @@ -223,22 +245,20 @@ "# QI = (age, education-num, capital-gain, hours-per-week)\n", "QI = [0, 1, 2, 4]\n", "anonymizer = Anonymize(100, QI)\n", - "anon = anonymizer.anonymize(x_train, x_train_predictions)\n", + "anon = anonymizer.anonymize(ArrayDataset(x_train, x_train_predictions))\n", "print(anon)" ] }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { - "text/plain": [ - "6739" - ] + "text/plain": "6739" }, - "execution_count": 104, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -250,16 +270,14 @@ }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { - "text/plain": [ - "658" - ] + "text/plain": "658" }, - "execution_count": 129, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -278,14 +296,14 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Anonymized model accuracy: 0.8304158221239482\n" + "Anonymized model accuracy: 0.83078434985566\n" ] } ], @@ -308,14 +326,22 @@ }, { 
"cell_type": "code", - "execution_count": 131, + "execution_count": 14, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", + " self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "0.5034393809114359\n" + "0.5047291487532244\n" ] } ], @@ -345,15 +371,15 @@ }, { "cell_type": "code", - "execution_count": 132, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(0.5298924372550654, 0.7806166318634075)\n", - "(0.5030507735890172, 0.5671293452892765)\n" + "(0.5312420517168291, 0.7696843139663432)\n", + "(0.5048372911169745, 0.4935511607910576)\n" ] } ], @@ -419,4 +445,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/notebooks/membership_inference_dp_diabetes_reg.ipynb b/notebooks/membership_inference_dp_diabetes_reg.ipynb index 1376dc6..92922ab 100644 --- a/notebooks/membership_inference_dp_diabetes_reg.ipynb +++ b/notebooks/membership_inference_dp_diabetes_reg.ipynb @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -86,14 +86,14 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.4954954954954955\n" + "0.527027027027027\n" ] } ], @@ -131,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 124, + 
"execution_count": 4, "metadata": {}, "outputs": [ { @@ -141,6 +141,22 @@ "unique rows in original data: 221\n" ] }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", + " self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n", + "/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", + " self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n", + "/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", + " self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n", + "/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", + " self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n", + "/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples,), for example using ravel().\n", + " self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -148,11 +164,12 @@ "k values: [5, 10, 20, 50, 75]\n", "unique rows: [34, 19, 8, 4, 2]\n", "model accuracy: [0.43165832354998956, 0.4509641063206041, -1.730181929385853, -5.577098823982753e+27, -1.2751609045828272e+25]\n", - "attack accuracy: [0.5, 0.47297297297297297, 0.49549549549549543, 0.5, 0.47297297297297297]\n" + "attack accuracy: [0.509009009009009, 0.481981981981982, 0.509009009009009, 0.5045045045045045, 0.4954954954954955]\n" ] } ], "source": [ + "from apt.utils.datasets import ArrayDataset\n", "from apt.anonymization import Anonymize\n", "k_values=[5, 10, 20, 50, 75]\n", "model_accuracy = []\n", @@ -165,7 +182,7 @@ "\n", "for k in k_values:\n", " anonymizer = Anonymize(k, QI, is_regression=True)\n", - " anon = anonymizer.anonymize(X_train, x_train_predictions)\n", + " anon = anonymizer.anonymize(ArrayDataset(X_train, x_train_predictions))\n", " unique_values.append(len(np.unique(anon, axis=0)))\n", " \n", " anon_model = LinearRegression()\n", @@ -198,7 +215,7 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [] diff --git a/notebooks/minimization_adult.ipynb b/notebooks/minimization_adult.ipynb index 17610a3..e8ccc20 100644 --- a/notebooks/minimization_adult.ipynb +++ b/notebooks/minimization_adult.ipynb @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -42,6 +42,18 @@ " [2.2000e+01 9.0000e+00 0.0000e+00 0.0000e+00 2.0000e+01]\n", " [5.2000e+01 9.0000e+00 1.5024e+04 0.0000e+00 4.0000e+01]]\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_13726/1357868359.py:22: DeprecationWarning: `np.int` is a deprecated alias for the 
builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " y_train = y_train.astype(np.int)\n", + "/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_13726/1357868359.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " y_test = y_test.astype(np.int)\n" + ] } ], "source": [ @@ -84,24 +96,27 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Base model accuracy: 0.8189914624408821\n" + "Base model accuracy: 0.8183158282660771\n" ] } ], "source": [ + "from apt.utils.datasets import ArrayDataset\n", + "from apt.utils.models import SklearnClassifier, ModelOutputType\n", "from sklearn.tree import DecisionTreeClassifier\n", "\n", - "model = DecisionTreeClassifier()\n", - "model.fit(x_train, y_train)\n", + "base_est = DecisionTreeClassifier()\n", + "model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)\n", + "model.fit(ArrayDataset(x_train, y_train))\n", "\n", - "print('Base model accuracy: ', model.score(x_test, y_test))" + "print('Base model accuracy: ', model.score(ArrayDataset(x_test, y_test)))" ] 
}, { @@ -114,26 +129,26 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.929376\n", + "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.936540\n", "Improving accuracy\n", - "feature to remove: 0\n", - "Removed feature: 0, new relative accuracy: 0.939867\n", - "feature to remove: 4\n", - "Removed feature: 4, new relative accuracy: 0.967247\n", "feature to remove: 2\n", - "Removed feature: 2, new relative accuracy: 0.972620\n", + "Removed feature: 2, new relative accuracy: 0.935261\n", + "feature to remove: 4\n", + "Removed feature: 4, new relative accuracy: 0.946776\n", + "feature to remove: 0\n", + "Removed feature: 0, new relative accuracy: 0.972876\n", "feature to remove: 1\n", - "Removed feature: 1, new relative accuracy: 0.992323\n", + "Removed feature: 1, new relative accuracy: 0.992835\n", "feature to remove: 3\n", "Removed feature: 3, new relative accuracy: 1.000000\n", - "Accuracy on minimized data: 0.8237371411024106\n" + "Accuracy on minimized data: 0.8231229847996315\n" ] } ], @@ -155,10 +170,12 @@ "X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(x_test, y_test, stratify=y_test,\n", " test_size = 0.4, random_state = 38)\n", "x_train_predictions = model.predict(X_generalizer_train)\n", - "minimizer.fit(X_generalizer_train, x_train_predictions)\n", - "transformed = minimizer.transform(x_test)\n", + "if x_train_predictions.shape[1] > 1:\n", + " x_train_predictions = np.argmax(x_train_predictions, axis=1)\n", + "minimizer.fit(dataset=ArrayDataset(X_generalizer_train, x_train_predictions))\n", + "transformed = minimizer.transform(dataset=ArrayDataset(x_test))\n", "\n", - 
"print('Accuracy on minimized data: ', model.score(transformed, y_test))" + "print('Accuracy on minimized data: ', model.score(ArrayDataset(transformed, y_test)))" ] }, { @@ -170,14 +187,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'ranges': {}, 'untouched': [0, 1, 2, 3, 4]}\n" + "{'ranges': {}, 'categories': {}, 'untouched': ['4', '1', '3', '0', '2']}\n" ] } ], @@ -197,25 +214,25 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.929376\n", + "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.936540\n", "Improving accuracy\n", - "feature to remove: 0\n", - "Removed feature: 0, new relative accuracy: 0.939867\n", - "feature to remove: 4\n", - "Removed feature: 4, new relative accuracy: 0.967247\n", "feature to remove: 2\n", - "Removed feature: 2, new relative accuracy: 0.972620\n", + "Removed feature: 2, new relative accuracy: 0.935261\n", + "feature to remove: 4\n", + "Removed feature: 4, new relative accuracy: 0.946776\n", + "feature to remove: 0\n", + "Removed feature: 0, new relative accuracy: 0.972876\n", "feature to remove: 1\n", - "Removed feature: 1, new relative accuracy: 0.992323\n", - "Accuracy on minimized data: 0.820205742361431\n", - "{'ranges': {3: [546.0, 704.0, 705.5, 742.5, 782.0, 834.0, 870.0, 1446.5, 1538.5, 1612.5, 1699.0, 1744.0, 1801.0, 1814.0, 1846.0, 1881.5, 1978.5, 2248.0, 2298.5, 2537.5]}, 'untouched': [0, 1, 2, 4]}\n" + "Removed feature: 1, new relative accuracy: 0.992835\n", + "Accuracy on minimized data: 0.8192845079072624\n", + "{'ranges': {'3': [569.0, 
782.0, 870.0, 870.5, 938.0, 1016.5, 1311.5, 1457.0, 1494.5, 1596.0, 1629.5, 1684.0, 1805.0, 1859.0, 1867.5, 1881.5, 1938.0, 1978.5, 2119.0, 2210.0, 2218.0, 2244.5, 2298.5, 2443.5]}, 'categories': {}, 'untouched': ['2', '1', '0', '4']}\n" ] } ], @@ -223,9 +240,9 @@ "# We allow a 1% deviation in accuracy from the original model accuracy\n", "minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.99)\n", "\n", - "minimizer2.fit(X_generalizer_train, x_train_predictions)\n", - "transformed2 = minimizer2.transform(x_test)\n", - "print('Accuracy on minimized data: ', model.score(transformed2, y_test))\n", + "minimizer2.fit(dataset=ArrayDataset(X_generalizer_train, x_train_predictions))\n", + "transformed2 = minimizer2.transform(dataset=ArrayDataset(x_test))\n", + "print('Accuracy on minimized data: ', model.score(test_data=ArrayDataset(transformed2, y_test)))\n", "generalizations2 = minimizer2.generalizations\n", "print(generalizations2)" ] @@ -259,4 +276,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index fa4131d..ec37771 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ numpy==1.21.0 pandas==1.1.0 scipy==1.4.1 scikit-learn==0.22.2 +adversarial-robustness-toolbox>=1.9.1 # testing pytest==5.4.2 diff --git a/tests/test_anonymizer.py b/tests/test_anonymizer.py index 000eefa..358398c 100644 --- a/tests/test_anonymizer.py +++ b/tests/test_anonymizer.py @@ -7,13 +7,15 @@ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.preprocessing import OneHotEncoder from apt.anonymization import Anonymize -from apt.utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset +from apt.utils.dataset_utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset from sklearn.datasets import load_diabetes from sklearn.model_selection import train_test_split +from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE def 
test_anonymize_ndarray_iris(): (x_train, y_train), _ = get_iris_dataset() + model = DecisionTreeClassifier() model.fit(x_train, y_train) pred = model.predict(x_train) @@ -21,7 +23,7 @@ def test_anonymize_ndarray_iris(): k = 10 QI = [0, 2] anonymizer = Anonymize(k, QI, train_only_QI=True) - anon = anonymizer.anonymize(x_train, pred) + anon = anonymizer.anonymize(ArrayDataset(x_train, pred)) assert(len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0))) _, counts_elements = np.unique(anon[:, QI], return_counts=True) assert (np.min(counts_elements) >= k) @@ -30,10 +32,14 @@ def test_anonymize_ndarray_iris(): def test_anonymize_pandas_adult(): (x_train, y_train), _ = get_adult_dataset() + encoded = OneHotEncoder().fit_transform(x_train) + model = DecisionTreeClassifier() + model.fit(encoded, y_train) + pred = model.predict(encoded) k = 100 - features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', - 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'] + features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', + 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'] QI = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'] categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', @@ -56,12 +62,11 @@ def test_anonymize_pandas_adult(): pred = model.predict(encoded) anonymizer = Anonymize(k, QI, categorical_features=categorical_features) - anon = anonymizer.anonymize(x_train, pred) + anon = anonymizer.anonymize(ArrayDataset(x_train, pred, features)) assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0]) assert (anon.loc[:, QI].value_counts().min() >= k) - assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1))) - + np.testing.assert_array_equal(anon.drop(QI, 
axis=1), x_train.drop(QI, axis=1)) def test_anonymize_pandas_nursery(): (x_train, y_train), _ = get_nursery_dataset() @@ -89,11 +94,11 @@ def test_anonymize_pandas_nursery(): pred = model.predict(encoded) anonymizer = Anonymize(k, QI, categorical_features=categorical_features, train_only_QI=True) - anon = anonymizer.anonymize(x_train, pred) + anon = anonymizer.anonymize(ArrayDataset(x_train, pred)) assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0]) assert (anon.loc[:, QI].value_counts().min() >= k) - assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1))) + np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1)) def test_regression(): @@ -107,7 +112,7 @@ def test_regression(): k = 10 QI = [0, 2, 5, 8] anonymizer = Anonymize(k, QI, is_regression=True, train_only_QI=True) - anon = anonymizer.anonymize(x_train, pred) + anon = anonymizer.anonymize(ArrayDataset(x_train, pred)) print('Base model accuracy (R2 score): ', model.score(x_test, y_test)) model.fit(anon, y_train) print('Base model accuracy (R2 score) after anonymization: ', model.score(x_test, y_test)) @@ -127,7 +132,7 @@ def test_errors(): anonymizer = Anonymize(10, [0, 2]) (x_train, y_train), (x_test, y_test) = get_iris_dataset() with pytest.raises(ValueError): - anonymizer.anonymize(x_train, y_test) + anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test)) (x_train, y_train), _ = get_adult_dataset() with pytest.raises(ValueError): - anonymizer.anonymize(x_train, y_train) + anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test)) diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py index e6f50be..630cd49 100644 --- a/tests/test_minimizer.py +++ b/tests/test_minimizer.py @@ -5,14 +5,15 @@ from sklearn.compose import ColumnTransformer from sklearn.datasets import load_boston, load_diabetes from sklearn.impute import SimpleImputer -from sklearn.linear_model import LogisticRegression from sklearn.model_selection import 
train_test_split from sklearn.pipeline import Pipeline -from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn.preprocessing import OneHotEncoder from apt.minimization import GeneralizeToRepresentative from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor -from apt.utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset, get_german_credit_dataset +from apt.utils.dataset_utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset, get_german_credit_dataset +from apt.utils.datasets import ArrayDataset +from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegressor @pytest.fixture @@ -38,11 +39,12 @@ def test_minimizer_params(data): y = [1, 1, 0] base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, min_samples_leaf=1) - base_est.fit(X, y) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR) + model.fit(ArrayDataset(X, y)) - gen = GeneralizeToRepresentative(base_est, features=features, cells=cells) + gen = GeneralizeToRepresentative(model, cells=cells) gen.fit() - transformed = gen.transform(X) + transformed = gen.transform(dataset=ArrayDataset(X, features_names=features)) def test_minimizer_fit(data): @@ -58,15 +60,20 @@ def test_minimizer_fit(data): [69, 175], [24, 181], [18, 190]]) - y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0] + y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]) base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, min_samples_leaf=1) - base_est.fit(X, y) - predictions = base_est.predict(X) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR) + model.fit(ArrayDataset(X, y)) + predictions = model.predict(X) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, axis=1) - gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5) - gen.fit(X, predictions) - transformed = gen.transform(X) + gen = GeneralizeToRepresentative(model, target_accuracy=0.5) + train_dataset = 
ArrayDataset(X, predictions, features_names=features) + + gen.fit(dataset=train_dataset) + transformed = gen.transform(dataset=ArrayDataset(X)) gener = gen.generalizations_ expexted_generalizations = {'ranges': {}, 'categories': {}, 'untouched': ['height', 'age']} @@ -103,7 +110,7 @@ def test_minimizer_fit_pandas(data): [69, 175, 'm', 'aa'], [24, 181, 'm', 'bb'], [18, 190, 'm', 'bb']] - y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0] + y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]) X = pd.DataFrame(X, columns=features) numeric_features = ["age", "height"] @@ -121,16 +128,22 @@ def test_minimizer_fit_pandas(data): ] ) encoded = preprocessor.fit_transform(X) + encoded = pd.DataFrame(encoded) base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, min_samples_leaf=1) - base_est.fit(encoded, y) - predictions = base_est.predict(encoded) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR) + model.fit(ArrayDataset(encoded, y)) + predictions = model.predict(encoded) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, axis=1) + # Append classifier to preprocessing pipeline. # Now we have a full prediction pipeline. 
- gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5, + gen = GeneralizeToRepresentative(model, target_accuracy=0.5, categorical_features=categorical_features) - gen.fit(X, predictions) - transformed = gen.transform(X) + train_dataset = ArrayDataset(X, predictions) + gen.fit(dataset=train_dataset) + transformed = gen.transform(dataset=ArrayDataset(X)) gener = gen.generalizations_ expexted_generalizations = {'ranges': {'age': []}, 'categories': {}, 'untouched': ['ola', 'height', 'sex']} @@ -143,7 +156,7 @@ def test_minimizer_fit_pandas(data): modified_features = [f for f in features if f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[ 'ranges'].keys()] - assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1))) + np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1)) ncp = gen.ncp_ if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0: assert (ncp > 0) @@ -179,7 +192,7 @@ def test_minimizer_params_categorical(data): [24, 181, 'm'], [18, 190, 'm']] - y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0] + y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]) X = pd.DataFrame(X, columns=features) numeric_features = ["age", "height"] numeric_transformer = Pipeline( @@ -196,16 +209,21 @@ def test_minimizer_params_categorical(data): ] ) encoded = preprocessor.fit_transform(X) + encoded = pd.DataFrame(encoded) base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, min_samples_leaf=1) - base_est.fit(encoded, y) - predictions = base_est.predict(encoded) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR) + model.fit(ArrayDataset(encoded, y)) + predictions = model.predict(encoded) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, axis=1) # Append classifier to preprocessing pipeline. # Now we have a full prediction pipeline. 
- gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5, + gen = GeneralizeToRepresentative(model, target_accuracy=0.5, categorical_features=categorical_features, cells=cells) - gen.fit(X, predictions) - transformed = gen.transform(X) + train_dataset = ArrayDataset(X, predictions) + gen.fit(dataset=train_dataset) + transformed = gen.transform(dataset=ArrayDataset(X)) def test_minimizer_fit_QI(data): @@ -222,16 +240,20 @@ def test_minimizer_fit_QI(data): [24, 181, 95], [18, 190, 102]]) print(X) - y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0] - QI = [0, 2] + y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]) + QI = ['age', 'weight'] base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, min_samples_leaf=1) - base_est.fit(X, y) - predictions = base_est.predict(X) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR) + model.fit(ArrayDataset(X, y)) + predictions = model.predict(X) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, axis=1) - gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5, features_to_minimize=QI) - gen.fit(X, predictions) - transformed = gen.transform(X) + gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI) + train_dataset = ArrayDataset(X, predictions, features_names=features) + gen.fit(dataset=train_dataset) + transformed = gen.transform(dataset=ArrayDataset(X)) gener = gen.generalizations_ expexted_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']} for key in expexted_generalizations['ranges']: @@ -240,7 +262,7 @@ def test_minimizer_fit_QI(data): assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) == set([frozenset(sl) for sl in gener['categories'][key]])) assert (set(expexted_generalizations['untouched']) == set(gener['untouched'])) - assert ((np.delete(transformed, QI, axis=1) == np.delete(X, QI, axis=1)).all()) + assert 
((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all()) modified_features = [f for f in features if f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[ 'ranges'].keys()] @@ -269,7 +291,7 @@ def test_minimizer_fit_pandas_QI(data): [24, 181, 49, 'm', 'bb'], [18, 190, 69, 'm', 'bb']] - y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0] + y = pd.Series([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]) X = pd.DataFrame(X, columns=features) QI = ['age', 'weight', 'ola'] @@ -288,16 +310,22 @@ def test_minimizer_fit_pandas_QI(data): ] ) encoded = preprocessor.fit_transform(X) + encoded = pd.DataFrame(encoded) base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, min_samples_leaf=1) - base_est.fit(encoded, y) - predictions = base_est.predict(encoded) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR) + model.fit(ArrayDataset(encoded, y)) + predictions = model.predict(encoded) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, axis=1) + # Append classifier to preprocessing pipeline. # Now we have a full prediction pipeline. 
- gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5, + gen = GeneralizeToRepresentative(model, target_accuracy=0.5, categorical_features=categorical_features, features_to_minimize=QI) - gen.fit(X, predictions) - transformed = gen.transform(X) + train_dataset = ArrayDataset(X, predictions) + gen.fit(dataset=train_dataset) + transformed = gen.transform(dataset=ArrayDataset(X)) gener = gen.generalizations_ expexted_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]}, 'untouched': ['height', 'sex']} @@ -308,12 +336,13 @@ def test_minimizer_fit_pandas_QI(data): assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) == set([frozenset(sl) for sl in gener['categories'][key]])) assert (set(expexted_generalizations['untouched']) == set(gener['untouched'])) - assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1))) - + # assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1))) + np.testing.assert_array_equal(transformed.drop(QI, axis=1), X.drop(QI, axis=1)) modified_features = [f for f in features if f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[ 'ranges'].keys()] - assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1))) + # assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1))) + np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1)) ncp = gen.ncp_ if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0: assert (ncp > 0) @@ -322,16 +351,19 @@ def test_minimizer_fit_pandas_QI(data): def test_minimize_ndarray_iris(): features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'] - (x_train, y_train), _ = get_iris_dataset() - QI = [0, 2] - model = DecisionTreeClassifier(random_state=0, 
min_samples_split=2, - min_samples_leaf=1) - model.fit(x_train, y_train) - pred = model.predict(x_train) + (x_train, y_train), (x_test, y_test) = get_iris_dataset() + QI = ['sepal length (cm)', 'petal length (cm)'] + base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, + min_samples_leaf=1) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR) + model.fit(ArrayDataset(x_train, y_train)) + predictions = model.predict(x_train) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, axis=1) - gen = GeneralizeToRepresentative(model, target_accuracy=0.3, features=features, features_to_minimize=QI) - gen.fit(x_train, pred) - transformed = gen.transform(x_train) + gen = GeneralizeToRepresentative(model, target_accuracy=0.3, features_to_minimize=QI) + # gen.fit(dataset=ArrayDataset(x_train, predictions)) + transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features)) gener = gen.generalizations_ expexted_generalizations = {'ranges': {'sepal length (cm)': [], 'petal length (cm)': [2.449999988079071]}, 'categories': {}, 'untouched': ['petal width (cm)', 'sepal width (cm)']} @@ -342,7 +374,7 @@ def test_minimize_ndarray_iris(): assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) == set([frozenset(sl) for sl in gener['categories'][key]])) assert (set(expexted_generalizations['untouched']) == set(gener['untouched'])) - assert ((np.delete(transformed, QI, axis=1) == np.delete(x_train, QI, axis=1)).all()) + assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(x_train, [0, 2], axis=1)).all()) modified_features = [f for f in features if f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[ @@ -359,12 +391,13 @@ def test_minimize_ndarray_iris(): def test_minimize_pandas_adult(): - (x_train, y_train), _ = get_adult_dataset() + (x_train, y_train), (x_test, y_test) = get_adult_dataset() x_train = x_train.head(1000) y_train = 
y_train.head(1000) features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'] + x_train = pd.DataFrame(x_train, columns=features) categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'hours-per-week', 'native-country'] @@ -384,15 +417,19 @@ def test_minimize_pandas_adult(): ] ) encoded = preprocessor.fit_transform(x_train) + encoded = pd.DataFrame(encoded) base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, min_samples_leaf=1) - base_est.fit(encoded, y_train) - predictions = base_est.predict(encoded) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR) + model.fit(ArrayDataset(encoded, y_train)) + predictions = model.predict(encoded) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, axis=1) - gen = GeneralizeToRepresentative(base_est, target_accuracy=0.7, features=features, + gen = GeneralizeToRepresentative(model, target_accuracy=0.7, categorical_features=categorical_features, features_to_minimize=QI) - gen.fit(x_train, predictions) - transformed = gen.transform(x_train) + gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features)) + transformed = gen.transform(dataset=ArrayDataset(x_train)) gener = gen.generalizations_ expexted_generalizations = {'ranges': {'age': [], 'education-num': []}, 'categories': { 'workclass': [['Self-emp-not-inc', 'Private', 'Federal-gov', 'Self-emp-inc', '?', 'Local-gov', 'State-gov']], @@ -414,12 +451,14 @@ def test_minimize_pandas_adult(): assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) == set([frozenset(sl) for sl in gener['categories'][key]])) assert (set(expexted_generalizations['untouched']) == set(gener['untouched'])) - assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1))) + # assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, 
axis=1))) + np.testing.assert_array_equal(transformed.drop(QI, axis=1), x_train.drop(QI, axis=1)) modified_features = [f for f in features if f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[ 'ranges'].keys()] - assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1))) + # assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1))) + np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1)) ncp = gen.ncp_ if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0: assert (ncp > 0) @@ -451,15 +490,19 @@ def test_german_credit_pandas(): ] ) encoded = preprocessor.fit_transform(x_train) + encoded = pd.DataFrame(encoded) base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, min_samples_leaf=1) - base_est.fit(encoded, y_train) - predictions = base_est.predict(encoded) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR) + model.fit(ArrayDataset(encoded, y_train)) + predictions = model.predict(encoded) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, axis=1) - gen = GeneralizeToRepresentative(base_est, target_accuracy=0.7, features=features, + gen = GeneralizeToRepresentative(model, target_accuracy=0.7, categorical_features=categorical_features, features_to_minimize=QI) - gen.fit(x_train, predictions) - transformed = gen.transform(x_train) + gen.fit(dataset=ArrayDataset(x_train, predictions)) + transformed = gen.transform(dataset=ArrayDataset(x_train)) gener = gen.generalizations_ expexted_generalizations = {'ranges': {'Duration_in_month': [31.5]}, 'categories': {'Credit_history': [['A30', 'A32', 'A31', 'A34', 'A33']], 'Purpose': [ @@ -481,12 +524,14 @@ def test_german_credit_pandas(): assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) == set([frozenset(sl) 
for sl in gener['categories'][key]])) assert (set(expexted_generalizations['untouched']) == set(gener['untouched'])) - assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1))) + # assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1))) + np.testing.assert_array_equal(transformed.drop(QI, axis=1), x_train.drop(QI, axis=1)) modified_features = [f for f in features if f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[ 'ranges'].keys()] - assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1))) + # assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1))) + np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1)) ncp = gen.ncp_ if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0: assert (ncp > 0) @@ -497,17 +542,258 @@ def test_regression(): dataset = load_diabetes() x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14) - model = DecisionTreeRegressor(random_state=10, min_samples_split=2) - model.fit(x_train, y_train) - pred = model.predict(x_train) - QI = [0, 2, 5, 8] + base_est = DecisionTreeRegressor(random_state=10, min_samples_split=2) + model = SklearnRegressor(base_est) + model.fit(ArrayDataset(x_train, y_train)) + predictions = model.predict(x_train) + QI = ['age', 'bmi', 's2', 's5'] features = ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'] - gen = GeneralizeToRepresentative(model, target_accuracy=0.7, features=features, is_regression=True, + gen = GeneralizeToRepresentative(model, target_accuracy=0.7, is_regression=True, features_to_minimize=QI) - gen.fit(x_train, pred) - transformed = gen.transform(x_train) + gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features)) + transformed = 
gen.transform(dataset=ArrayDataset(x_train, features_names=features)) + print('Base model accuracy (R2 score): ', model.score(ArrayDataset(x_test, y_test))) + model.fit(ArrayDataset(transformed, y_train)) + print('Base model accuracy (R2 score) after anonymization: ', model.score(ArrayDataset(x_test, y_test))) + gener = gen.generalizations_ + expexted_generalizations = {'ranges': { + 'age': [-0.07816532626748085, -0.07090024650096893, -0.05637009255588055, -0.05092128552496433, + -0.04728874587453902, -0.04547247663140297, -0.04183994047343731, -0.027309784665703773, + -0.023677248042076826, -0.020044708624482155, -0.01641217083670199, -0.001882016600575298, + 0.0017505218856967986, 0.0035667913616634905, 0.007199329789727926, 0.010831868276000023, + 0.02354575227946043, 0.030810829252004623, 0.03262709779664874, 0.03444336913526058, + 0.03625963814556599, 0.03807590529322624, 0.03807590715587139, 0.047157252207398415, + 0.06168740428984165, 0.0635036751627922, 0.06895248219370842, 0.07258502021431923, 0.07621755823493004, + 0.1034616008400917], + 'bmi': [-0.07626373693346977, -0.060635464265942574, -0.056863121688365936, -0.05578530766069889, + -0.054168591275811195, -0.042312657460570335, -0.0374625027179718, -0.03422906715422869, + -0.033690162003040314, -0.03261234890669584, -0.02614547684788704, -0.025067666545510292, + -0.022373135201632977, -0.016984074376523495, -0.01375063881278038, -0.007822672137990594, + -0.004589236050378531, 0.008344509289599955, 0.015889193629845977, 0.016967005096375942, + 0.024511689320206642, 0.0272062208969146, 0.030978563241660595, 0.032595280557870865, + 0.033673093654215336, 0.04391230642795563, 0.04552902653813362, 0.05469042807817459, + 0.06977979838848114, 0.07301323488354683, 0.09349166229367256], + 's2': [-0.1044962927699089, -0.08649025857448578, -0.07740895450115204, -0.07114598527550697, + -0.06378699466586113, -0.05971606448292732, -0.04437179118394852, -0.0398311372846365, + -0.03137612994760275, 
-0.022138250060379505, -0.018067320343106985, -0.017910746857523918, + -0.017910745926201344, -0.01618842873722315, -0.007576846517622471, -0.007263698382303119, + -0.0010007291566580534, 0.0010347360512241721, 0.006514834007248282, 0.00933317095041275, + 0.012464655097573996, 0.019197346206055954, 0.020919663831591606, 0.02217225730419159, + 0.032036433927714825, 0.036420512944459915, 0.04080459102988243, 0.04127431474626064, + 0.04268348217010498, 0.04424922354519367, 0.04424922540783882, 0.056462014093995094, 0.05928034894168377, + 0.061315815430134535, 0.06272498145699501, 0.06460387445986271]}, 'categories': {}, + 'untouched': ['s5', 's3', 'bp', 's1', 'sex', 's6', 's4']} + + for key in expexted_generalizations['ranges']: + assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key])) + for key in expexted_generalizations['categories']: + assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) == + set([frozenset(sl) for sl in gener['categories'][key]])) + assert (set(expexted_generalizations['untouched']) == set(gener['untouched'])) + assert ((np.delete(transformed, [0, 2, 5, 8], axis=1) == np.delete(x_train, [0, 2, 5, 8], axis=1)).all()) + + modified_features = [f for f in features if + f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[ + 'ranges'].keys()] + indexes = [] + for i in range(len(features)): + if features[i] in modified_features: + indexes.append(i) + assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all()) + ncp = gen.ncp_ + if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0: + assert (ncp > 0) + assert (((transformed[indexes]) != (x_train[indexes])).any()) + + +def test_X_y(data): + features = [0, 1, 2] + X = np.array([[23, 165, 70], + [45, 158, 67], + [56, 123, 65], + [67, 154, 90], + [45, 149, 67], + [42, 166, 58], + [73, 172, 68], + [94, 168, 69], + [69, 175, 80], 
+ [24, 181, 95], + [18, 190, 102]]) + print(X) + y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]) + QI = [0, 2] + base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, + min_samples_leaf=1) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR) + model.fit(ArrayDataset(X, y)) + predictions = model.predict(X) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, axis=1) + + gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI) + gen.fit(X=X, y=predictions) + transformed = gen.transform(X) + gener = gen.generalizations_ + expexted_generalizations = {'ranges': {'0': [], '2': [67.5]}, 'categories': {}, 'untouched': ['1']} + for key in expexted_generalizations['ranges']: + assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key])) + for key in expexted_generalizations['categories']: + assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) == + set([frozenset(sl) for sl in gener['categories'][key]])) + assert (set(expexted_generalizations['untouched']) == set(gener['untouched'])) + assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all()) + modified_features = [f for f in features if + str(f) in expexted_generalizations['categories'].keys() or str(f) in expexted_generalizations[ + 'ranges'].keys()] + indexes = [] + for i in range(len(features)): + if features[i] in modified_features: + indexes.append(i) + assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all()) + ncp = gen.ncp_ + if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0: + assert (ncp > 0) + assert (((transformed[indexes]) != (X[indexes])).any()) + + +def test_X_y_features_names(data): + features = ['age', 'height', 'weight'] + X = np.array([[23, 165, 70], + [45, 158, 67], + [56, 123, 65], + [67, 154, 90], + [45, 149, 67], + [42, 166, 58], + [73, 172, 68], + 
[94, 168, 69], + [69, 175, 80], + [24, 181, 95], + [18, 190, 102]]) + print(X) + y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]) + QI = ['age', 'weight'] + base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, + min_samples_leaf=1) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR) + model.fit(ArrayDataset(X, y)) + predictions = model.predict(X) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, axis=1) + + gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI) + gen.fit(X=X, y=predictions, features_names=features) + transformed = gen.transform(X=X, features_names=features) + gener = gen.generalizations_ + expexted_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']} + for key in expexted_generalizations['ranges']: + assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key])) + for key in expexted_generalizations['categories']: + assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) == + set([frozenset(sl) for sl in gener['categories'][key]])) + assert (set(expexted_generalizations['untouched']) == set(gener['untouched'])) + assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all()) + modified_features = [f for f in features if + f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[ + 'ranges'].keys()] + indexes = [] + for i in range(len(features)): + if features[i] in modified_features: + indexes.append(i) + assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all()) + ncp = gen.ncp_ + if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0: + assert (ncp > 0) + assert (((transformed[indexes]) != (X[indexes])).any()) + + +def test_BaseEstimator_classification(data): + features = ['age', 'height', 'weight', 'sex', 'ola'] + X = [[23, 
165, 65, 'f', 'aa'], + [45, 158, 76, 'f', 'aa'], + [56, 123, 78, 'f', 'bb'], + [67, 154, 87, 'm', 'aa'], + [45, 149, 45, 'f', 'bb'], + [42, 166, 76, 'm', 'bb'], + [73, 172, 85, 'm', 'bb'], + [94, 168, 92, 'f', 'aa'], + [69, 175, 95, 'm', 'aa'], + [24, 181, 49, 'm', 'bb'], + [18, 190, 69, 'm', 'bb']] + + y = pd.Series([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]) + X = pd.DataFrame(X, columns=features) + QI = ['age', 'weight', 'ola'] + + numeric_features = ["age", "height", "weight"] + numeric_transformer = Pipeline( + steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))] + ) + + categorical_features = ["sex", "ola"] + categorical_transformer = OneHotEncoder(handle_unknown="ignore") + + preprocessor = ColumnTransformer( + transformers=[ + ("num", numeric_transformer, numeric_features), + ("cat", categorical_transformer, categorical_features), + ] + ) + encoded = preprocessor.fit_transform(X) + encoded = pd.DataFrame(encoded) + base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, + min_samples_leaf=1) + model = base_est + model.fit(encoded, y) + predictions = model.predict(encoded) + + # Append classifier to preprocessing pipeline. + # Now we have a full prediction pipeline. 
+ gen = GeneralizeToRepresentative(model, target_accuracy=0.5, + categorical_features=categorical_features, features_to_minimize=QI) + train_dataset = ArrayDataset(X, predictions) + gen.fit(dataset=train_dataset) + transformed = gen.transform(dataset=ArrayDataset(X)) + gener = gen.generalizations_ + expexted_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]}, + 'untouched': ['height', 'sex']} + + for key in expexted_generalizations['ranges']: + assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key])) + for key in expexted_generalizations['categories']: + assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) == + set([frozenset(sl) for sl in gener['categories'][key]])) + assert (set(expexted_generalizations['untouched']) == set(gener['untouched'])) + # assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1))) + np.testing.assert_array_equal(transformed.drop(QI, axis=1), X.drop(QI, axis=1)) + modified_features = [f for f in features if + f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[ + 'ranges'].keys()] + # assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1))) + np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1)) + ncp = gen.ncp_ + if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0: + assert (ncp > 0) + assert (((transformed[modified_features]).equals(X[modified_features])) == False) + + +def test_BaseEstimator_regression(): + dataset = load_diabetes() + x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14) + + base_est = DecisionTreeRegressor(random_state=10, min_samples_split=2) + model = base_est + model.fit(x_train, y_train) + predictions = model.predict(x_train) + QI = ['age', 'bmi', 's2', 's5'] + 
features = ['age', 'sex', 'bmi', 'bp', + 's1', 's2', 's3', 's4', 's5', 's6'] + + gen = GeneralizeToRepresentative(model, target_accuracy=0.7, is_regression=True, + features_to_minimize=QI) + gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features)) + transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features)) print('Base model accuracy (R2 score): ', model.score(x_test, y_test)) model.fit(transformed, y_train) print('Base model accuracy (R2 score) after minimization: ', model.score(x_test, y_test)) @@ -546,7 +832,7 @@ def test_regression(): assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) == set([frozenset(sl) for sl in gener['categories'][key]])) assert (set(expexted_generalizations['untouched']) == set(gener['untouched'])) - assert ((np.delete(transformed, QI, axis=1) == np.delete(x_train, QI, axis=1)).all()) + assert ((np.delete(transformed, [0, 2, 5, 8], axis=1) == np.delete(x_train, [0, 2, 5, 8], axis=1)).all()) modified_features = [f for f in features if f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[ diff --git a/tests/test_model.py b/tests/test_model.py new file mode 100644 index 0000000..bbb951b --- /dev/null +++ b/tests/test_model.py @@ -0,0 +1,35 @@ +import pytest + +from apt.utils.models import SklearnClassifier, SklearnRegressor, ModelOutputType +from apt.utils.datasets import ArrayDataset +from apt.utils import dataset_utils + +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import RandomForestClassifier + + +def test_sklearn_classifier(): + (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset() + underlying_model = RandomForestClassifier() + model = SklearnClassifier(underlying_model, ModelOutputType.CLASSIFIER_VECTOR) + train = ArrayDataset(x_train, y_train) + test = ArrayDataset(x_test, y_test) + model.fit(train) + pred = model.predict(x_test) + assert(pred.shape[0] == x_test.shape[0]) + + score = 
model.score(test) + assert(0.0 <= score <= 1.0) + + +def test_sklearn_regressor(): + (x_train, y_train), (x_test, y_test) = dataset_utils.get_diabetes_dataset() + underlying_model = DecisionTreeRegressor() + model = SklearnRegressor(underlying_model) + train = ArrayDataset(x_train, y_train) + test = ArrayDataset(x_test, y_test) + model.fit(train) + pred = model.predict(x_test) + assert (pred.shape[0] == x_test.shape[0]) + + score = model.score(test)