mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-04-25 04:46:21 +02:00
Support 1-hot encoded features in anonymization + fixes related to encoding in minimization (#86)
* Support 1-hot encoded features in anonymization (#72) * Fix anonymization adult notebook + new notebook to demonstrate anonymization on 1-hot encoded data * Minimizer: No default encoder, if none provided data is supplied to the model as is. Fix data type of representative values. Fix and add more tests. Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
parent
26addd192f
commit
5dce961092
7 changed files with 670 additions and 255 deletions
|
|
@ -23,7 +23,11 @@ class Anonymize:
|
|||
:type k: int
|
||||
:param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
|
||||
in case of numpy data.
|
||||
:type quasi_identifiers: np.ndarray or list
|
||||
:type quasi_identifiers: np.ndarray or list of strings or integers.
|
||||
:param quasi_identifer_slices: If some of the quasi-identifiers represent 1-hot encoded features that need to remain
|
||||
consistent after anonymization, provide a list containing the list of column names
|
||||
or indexes that represent a single feature.
|
||||
:type quasi_identifer_slices: list of lists of strings or integers.
|
||||
:param categorical_features: The list of categorical features (if supplied, these features will be one-hot encoded
|
||||
before using them to train the decision tree model).
|
||||
:type categorical_features: list, optional
|
||||
|
|
@ -35,8 +39,12 @@ class Anonymize:
|
|||
:type train_only_QI: boolean, optional
|
||||
"""
|
||||
|
||||
def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
|
||||
is_regression: Optional[bool] = False, train_only_QI: Optional[bool] = False):
|
||||
def __init__(self, k: int,
|
||||
quasi_identifiers: Union[np.ndarray, list],
|
||||
quasi_identifer_slices: Optional[list] = None,
|
||||
categorical_features: Optional[list] = None,
|
||||
is_regression: Optional[bool] = False,
|
||||
train_only_QI: Optional[bool] = False):
|
||||
if k < 2:
|
||||
raise ValueError("k should be a positive integer with a value of 2 or higher")
|
||||
if quasi_identifiers is None or len(quasi_identifiers) < 1:
|
||||
|
|
@ -49,6 +57,7 @@ class Anonymize:
|
|||
self.train_only_QI = train_only_QI
|
||||
self.features_names = None
|
||||
self.features = None
|
||||
self.quasi_identifer_slices = quasi_identifer_slices
|
||||
|
||||
def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
|
||||
"""
|
||||
|
|
@ -76,7 +85,14 @@ class Anonymize:
|
|||
if self.categorical_features and not set(self.categorical_features).issubset(set(self.features_names)):
|
||||
raise ValueError('Categorical features should bs a subset of the supplied features or indexes in range of '
|
||||
'the data columns')
|
||||
# transform quasi identifiers to indexes
|
||||
self.quasi_identifiers = [i for i, v in enumerate(self.features_names) if v in self.quasi_identifiers]
|
||||
if self.quasi_identifer_slices:
|
||||
temp_list = []
|
||||
for slice in self.quasi_identifer_slices:
|
||||
new_slice = [i for i, v in enumerate(self.features_names) if v in slice]
|
||||
temp_list.append(new_slice)
|
||||
self.quasi_identifer_slices = temp_list
|
||||
if self.categorical_features:
|
||||
self.categorical_features = [i for i, v in enumerate(self.features_names) if v in self.categorical_features]
|
||||
|
||||
|
|
@ -126,31 +142,49 @@ class Anonymize:
|
|||
return cells_by_id
|
||||
|
||||
def _find_representatives(self, x, x_anonymizer_train, cells):
|
||||
# x is original data, x_anonymizer_train is only QIs + 1-hot encoded
|
||||
# x is original data (always numpy), x_anonymizer_train is only QIs + 1-hot encoded
|
||||
node_ids = self._find_sample_nodes(x_anonymizer_train)
|
||||
if self.quasi_identifer_slices:
|
||||
all_one_hot_features = set([feature for encoded in self.quasi_identifer_slices for feature in encoded])
|
||||
else:
|
||||
all_one_hot_features = set()
|
||||
for cell in cells:
|
||||
cell['representative'] = {}
|
||||
# get all rows in cell
|
||||
indexes = [index for index, node_id in enumerate(node_ids) if node_id == cell['id']]
|
||||
# TODO: should we filter only those with majority label? (using hist)
|
||||
rows = x[indexes]
|
||||
for feature in self.quasi_identifiers:
|
||||
values = rows[:, feature]
|
||||
if self.categorical_features and feature in self.categorical_features:
|
||||
# find most common value
|
||||
cell['representative'][feature] = Counter(values).most_common(1)[0][0]
|
||||
else:
|
||||
# find the mean value (per feature)
|
||||
median = np.median(values)
|
||||
min_value = max(values)
|
||||
min_dist = float("inf")
|
||||
for value in values:
|
||||
# euclidean distance between two floating point values
|
||||
dist = abs(value - median)
|
||||
if dist < min_dist:
|
||||
min_dist = dist
|
||||
min_value = value
|
||||
cell['representative'][feature] = min_value
|
||||
done = set()
|
||||
for feature in self.quasi_identifiers: # self.quasi_identifiers are numerical indexes
|
||||
if feature not in done:
|
||||
# deal with 1-hot encoded features
|
||||
if feature in all_one_hot_features:
|
||||
# find features that belong together
|
||||
for encoded in self.quasi_identifer_slices:
|
||||
if feature in encoded:
|
||||
values = rows[:, encoded]
|
||||
unique_rows, counts = np.unique(values, axis=0, return_counts=True)
|
||||
rep = unique_rows[np.argmax(counts)]
|
||||
for i, e in enumerate(encoded):
|
||||
done.add(e)
|
||||
cell['representative'][e] = rep[i]
|
||||
else: # rest of features
|
||||
values = rows[:, feature]
|
||||
if self.categorical_features and feature in self.categorical_features:
|
||||
# find most common value
|
||||
cell['representative'][feature] = Counter(values).most_common(1)[0][0]
|
||||
else:
|
||||
# find the mean value (per feature)
|
||||
median = np.median(values)
|
||||
min_value = max(values)
|
||||
min_dist = float("inf")
|
||||
for value in values:
|
||||
# euclidean distance between two floating point values
|
||||
dist = abs(value - median)
|
||||
if dist < min_dist:
|
||||
min_dist = dist
|
||||
min_value = value
|
||||
cell['representative'][feature] = min_value
|
||||
|
||||
def _find_sample_nodes(self, samples):
|
||||
paths = self._anonymizer.decision_path(samples).toarray()
|
||||
|
|
|
|||
|
|
@ -10,9 +10,6 @@ import copy
|
|||
import sys
|
||||
from scipy.spatial import distance
|
||||
from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
|
||||
from sklearn.utils.validation import check_is_fitted
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
|
|
@ -57,7 +54,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
:param categorical_features: The list of categorical features (if supplied, these features will be one-hot
|
||||
encoded before using them to train the decision tree model).
|
||||
:param encoder: Optional encoder for encoding data before feeding it into the estimator (e.g., for categorical
|
||||
features)
|
||||
features). If not provided, the data will be fed as is directly to the estimator.
|
||||
:type encoder: sklearn OrdinalEncoder or OneHotEncoder
|
||||
:type categorical_features: list of strings, optional
|
||||
:param features_to_minimize: The features to be minimized.
|
||||
|
|
@ -256,7 +253,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
# Going to fit
|
||||
# (currently not dealing with option to fit with only X and y and no estimator)
|
||||
if self.estimator and dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
|
||||
dtype = dataset.get_samples().dtype
|
||||
x = pd.DataFrame(dataset.get_samples(), columns=self._features)
|
||||
if not self.features_to_minimize:
|
||||
self.features_to_minimize = self._features
|
||||
|
|
@ -293,21 +289,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
# collect feature data (such as min, max)
|
||||
self._feature_data = self._get_feature_data(x)
|
||||
|
||||
# default encoder in case none provided
|
||||
if self.encoder is None:
|
||||
numeric_features = [f for f in self._features if f not in self.categorical_features]
|
||||
numeric_transformer = Pipeline(
|
||||
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
|
||||
)
|
||||
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
|
||||
self.encoder = ColumnTransformer(
|
||||
transformers=[
|
||||
("num", numeric_transformer, numeric_features),
|
||||
("cat", categorical_transformer, self.categorical_features),
|
||||
]
|
||||
)
|
||||
self.encoder.fit(x)
|
||||
|
||||
self.cells = []
|
||||
self._categorical_values = {}
|
||||
|
||||
|
|
@ -334,14 +315,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
|
||||
|
||||
# self._cells currently holds the generalization created from the tree leaves
|
||||
self._calculate_generalizations(x_test)
|
||||
if self.generalize_using_transform:
|
||||
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
|
||||
else:
|
||||
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
|
||||
generalized = self._generalize(x_test, x_prepared_test, nodes)
|
||||
|
||||
# check accuracy
|
||||
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype), y_test))
|
||||
accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
|
||||
print('Initial accuracy of model on generalized data, relative to original model predictions '
|
||||
'(base generalization derived from tree, before improvements): %f' % accuracy)
|
||||
|
||||
|
|
@ -364,15 +341,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
|
||||
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
|
||||
|
||||
self._calculate_generalizations(x_test)
|
||||
if self.generalize_using_transform:
|
||||
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
|
||||
self._cells_by_id)
|
||||
else:
|
||||
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
|
||||
|
||||
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype),
|
||||
y_test))
|
||||
generalized = self._generalize(x_test, x_prepared_test, nodes)
|
||||
accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
|
||||
# if accuracy passed threshold roll back to previous iteration generalizations
|
||||
if accuracy < self.target_accuracy:
|
||||
self.cells = cells_previous_iter
|
||||
|
|
@ -395,14 +365,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
if removed_feature is None:
|
||||
break
|
||||
|
||||
self._calculate_generalizations(x_test)
|
||||
if self.generalize_using_transform:
|
||||
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
|
||||
self._cells_by_id)
|
||||
else:
|
||||
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
|
||||
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype),
|
||||
y_test))
|
||||
generalized = self._generalize(x_test, x_prepared_test, nodes)
|
||||
accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
|
||||
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
|
||||
|
||||
# self._cells currently holds the chosen generalization based on target accuracy
|
||||
|
|
@ -893,7 +857,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
|
||||
def _generalize_indexes(self, original_data, cells, all_indexes):
|
||||
# prepared data include one hot encoded categorical data + QI
|
||||
representatives = pd.DataFrame(columns=self._features) # empty except for columns
|
||||
dtypes = original_data.dtypes.to_dict()
|
||||
new_dtypes = {}
|
||||
for t in dtypes.keys():
|
||||
new_dtypes[t] = pd.Series(dtype=dtypes[t].name)
|
||||
representatives = pd.DataFrame(new_dtypes) # empty except for columns
|
||||
original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)
|
||||
|
||||
# iterate over cells (leaves in decision tree)
|
||||
|
|
@ -925,6 +893,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
|
||||
return original_data_generalized
|
||||
|
||||
def _generalize(self, data, data_prepared, nodes):
|
||||
self._calculate_generalizations(data)
|
||||
if self.generalize_using_transform:
|
||||
generalized = self._generalize_from_tree(data, data_prepared, nodes, self.cells,
|
||||
self._cells_by_id)
|
||||
else:
|
||||
generalized = self._generalize_from_generalizations(data, self.generalizations)
|
||||
return generalized
|
||||
|
||||
@staticmethod
|
||||
def _map_to_ranges_categories(samples, ranges, categories):
|
||||
all_sample_indexes = []
|
||||
|
|
@ -994,18 +971,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
feature_data[feature],
|
||||
total)
|
||||
if feature_ncp > 0:
|
||||
# divide by accuracy gain
|
||||
new_cells = copy.deepcopy(self.cells)
|
||||
cells_by_id = copy.deepcopy(self._cells_by_id)
|
||||
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
|
||||
generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
|
||||
cells_by_id)
|
||||
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
|
||||
labels)) - current_accuracy
|
||||
if accuracy_gain < 0:
|
||||
accuracy_gain = 0
|
||||
if accuracy_gain != 0:
|
||||
feature_ncp = feature_ncp / accuracy_gain
|
||||
feature_ncp = self._normalize_ncp_by_accuracy_gain(original_data, prepared_data, nodes, feature,
|
||||
feature_ncp, labels, current_accuracy)
|
||||
|
||||
if feature_ncp < range_min:
|
||||
range_min = feature_ncp
|
||||
|
|
@ -1021,19 +988,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
feature_data[feature],
|
||||
total)
|
||||
if feature_ncp > 0:
|
||||
# divide by accuracy loss
|
||||
new_cells = copy.deepcopy(self.cells)
|
||||
cells_by_id = copy.deepcopy(self._cells_by_id)
|
||||
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
|
||||
generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
|
||||
cells_by_id)
|
||||
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
|
||||
labels)) - current_accuracy
|
||||
feature_ncp = self._normalize_ncp_by_accuracy_gain(original_data, prepared_data, nodes, feature,
|
||||
feature_ncp, labels, current_accuracy)
|
||||
|
||||
if accuracy_gain < 0:
|
||||
accuracy_gain = 0
|
||||
if accuracy_gain != 0:
|
||||
feature_ncp = feature_ncp / accuracy_gain
|
||||
if feature_ncp < range_min:
|
||||
range_min = feature_ncp
|
||||
remove_feature = feature
|
||||
|
|
@ -1063,6 +1020,21 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
feature_ncp += cell_ncp
|
||||
return feature_ncp
|
||||
|
||||
def _normalize_ncp_by_accuracy_gain(self, original_data, prepared_data, nodes, feature, feature_ncp, labels,
|
||||
current_accuracy):
|
||||
new_cells = copy.deepcopy(self.cells)
|
||||
cells_by_id = copy.deepcopy(self._cells_by_id)
|
||||
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
|
||||
generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
|
||||
cells_by_id)
|
||||
accuracy = self._calculate_accuracy(generalized, labels, self.estimator, self.encoder)
|
||||
accuracy_gain = accuracy - current_accuracy
|
||||
if accuracy_gain < 0:
|
||||
accuracy_gain = 0
|
||||
if accuracy_gain != 0:
|
||||
feature_ncp = feature_ncp / accuracy_gain
|
||||
return feature_ncp
|
||||
|
||||
def _calculate_generalizations(self, samples: Optional[pd.DataFrame] = None):
|
||||
ranges, range_representatives = self._calculate_ranges(self.cells)
|
||||
categories, category_representatives = self._calculate_categories(self.cells)
|
||||
|
|
@ -1282,3 +1254,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
|
||||
for feature in to_remove:
|
||||
del generalizations['categories'][feature]
|
||||
|
||||
@staticmethod
|
||||
def _calculate_accuracy(generalized, y_test, estimator, encoder):
|
||||
generalized_data = encoder.transform(generalized) if encoder else generalized
|
||||
return estimator.score(ArrayDataset(generalized_data, y_test))
|
||||
|
|
|
|||
|
|
@ -368,7 +368,7 @@ class PyTorchClassifier(PyTorchModel):
|
|||
if validation_data is None:
|
||||
self._art_model.fit(
|
||||
x=train_data.get_samples(),
|
||||
y=train_data.get_labels().reshape(-1, 1),
|
||||
y=train_data.get_labels(),
|
||||
batch_size=batch_size,
|
||||
nb_epochs=nb_epochs,
|
||||
save_checkpoints=save_checkpoints,
|
||||
|
|
@ -379,9 +379,9 @@ class PyTorchClassifier(PyTorchModel):
|
|||
else:
|
||||
self._art_model.fit(
|
||||
x=train_data.get_samples(),
|
||||
y=train_data.get_labels().reshape(-1, 1),
|
||||
y=train_data.get_labels(),
|
||||
x_validation=validation_data.get_samples(),
|
||||
y_validation=validation_data.get_labels().reshape(-1, 1),
|
||||
y_validation=validation_data.get_labels(),
|
||||
batch_size=batch_size,
|
||||
nb_epochs=nb_epochs,
|
||||
save_checkpoints=save_checkpoints,
|
||||
|
|
|
|||
303
notebooks/anonymization_one_hot_adult.ipynb
Normal file
303
notebooks/anonymization_one_hot_adult.ipynb
Normal file
|
|
@ -0,0 +1,303 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Using ML anonymization on one-hot encoded data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In this tutorial we will show how to anonymize models using the ML anonymization module, specifically when the input data is already one-hot encoded. \n",
|
||||
"\n",
|
||||
"This will be demonstrated using the Adult dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/datasets/adult). "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[['State-gov' 'Never-married' 'Adm-clerical' ... 'White' 'Male'\n",
|
||||
" 'UnitedStates']\n",
|
||||
" ['Self-emp-not-inc' 'Married-civ-spouse' 'Exec-managerial' ... 'White'\n",
|
||||
" 'Male' 'UnitedStates']\n",
|
||||
" ['Private' 'Divorced' 'Handlers-cleaners' ... 'White' 'Male'\n",
|
||||
" 'UnitedStates']\n",
|
||||
" ...\n",
|
||||
" ['Private' 'Never-married' 'Sales' ... 'White' 'Female' 'UnitedStates']\n",
|
||||
" ['Private' 'Never-married' 'Craft-repair' ... 'White' 'Male'\n",
|
||||
" 'UnitedStates']\n",
|
||||
" ['Private' 'Never-married' 'Handlers-cleaners' ... 'White' 'Male'\n",
|
||||
" 'UnitedStates']]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"sys.path.insert(0, os.path.abspath('..'))\n",
|
||||
"from apt.utils.dataset_utils import get_adult_dataset_pd\n",
|
||||
"\n",
|
||||
"# 'workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'\n",
|
||||
"categorical_features = [1, 3, 4, 5, 6, 7, 11]\n",
|
||||
"\n",
|
||||
"# requires a folder called 'datasets' in the current directory\n",
|
||||
"(x_train, y_train), (x_test, y_test) = get_adult_dataset_pd()\n",
|
||||
"x_train = x_train.to_numpy()[:, [1, 3, 4, 5, 6, 7, 11]]\n",
|
||||
"y_train = y_train.to_numpy().astype(int)\n",
|
||||
"x_test = x_test.to_numpy()[:, [1, 3, 4, 5, 6, 7, 11]]\n",
|
||||
"y_test = y_test.to_numpy().astype(int)\n",
|
||||
"\n",
|
||||
"# get balanced dataset\n",
|
||||
"x_train = x_train[:x_test.shape[0]]\n",
|
||||
"y_train = y_train[:y_test.shape[0]]\n",
|
||||
"\n",
|
||||
"print(x_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Encode data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[0 0 0 ... 0 1 0]\n",
|
||||
" [0 0 0 ... 0 1 0]\n",
|
||||
" [0 0 0 ... 0 1 0]\n",
|
||||
" ...\n",
|
||||
" [0 0 0 ... 0 1 0]\n",
|
||||
" [0 0 0 ... 0 1 0]\n",
|
||||
" [0 0 0 ... 0 1 0]]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||||
"import scipy\n",
|
||||
"\n",
|
||||
"preprocessor = OneHotEncoder(handle_unknown=\"ignore\")\n",
|
||||
"\n",
|
||||
"x_train = preprocessor.fit_transform(x_train)\n",
|
||||
"x_test = preprocessor.transform(x_test)\n",
|
||||
"if scipy.sparse.issparse(x_train):\n",
|
||||
" x_train = x_train.toarray().astype(int)\n",
|
||||
"if scipy.sparse.issparse(x_test):\n",
|
||||
" x_test = x_test.toarray().astype(int)\n",
|
||||
"\n",
|
||||
"print(x_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train decision tree model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Base model accuracy: 0.814446287083103\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/abigailt/Library/Python/3.9/lib/python/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
|
||||
" warnings.warn(msg, category=FutureWarning)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.tree import DecisionTreeClassifier\n",
|
||||
"from art.estimators.classification.scikitlearn import ScikitlearnDecisionTreeClassifier\n",
|
||||
"\n",
|
||||
"model = DecisionTreeClassifier()\n",
|
||||
"model.fit(x_train, y_train)\n",
|
||||
"\n",
|
||||
"art_classifier = ScikitlearnDecisionTreeClassifier(model)\n",
|
||||
"\n",
|
||||
"print('Base model accuracy: ', model.score(x_test, y_test))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Anonymize data\n",
|
||||
"## k=100\n",
|
||||
"\n",
|
||||
"The data is anonymized on the quasi-identifiers race and sex (both one-hot encoded), with a privacy parameter k=100.\n",
|
||||
"\n",
|
||||
"This means that each record in the anonymized dataset is identical to 99 others on the quasi-identifier values (i.e., when looking only at those features, the records are indistinguishable)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[0 0 0 ... 0 1 0]\n",
|
||||
" [0 0 0 ... 0 1 0]\n",
|
||||
" [0 0 0 ... 0 1 0]\n",
|
||||
" ...\n",
|
||||
" [0 0 0 ... 0 1 0]\n",
|
||||
" [0 0 0 ... 0 1 0]\n",
|
||||
" [0 0 0 ... 0 1 0]]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from apt.utils.datasets import ArrayDataset\n",
|
||||
"from apt.anonymization import Anonymize\n",
|
||||
"\n",
|
||||
"x_train_predictions = np.array([np.argmax(arr) for arr in art_classifier.predict(x_train)])\n",
|
||||
"\n",
|
||||
"# QI = (race, sex)\n",
|
||||
"QI = [53, 52, 51, 50, 49, 48, 47]\n",
|
||||
"QI_slices = [[47, 48, 49, 50, 51], [52, 53]]\n",
|
||||
"anonymizer = Anonymize(100, QI)\n",
|
||||
"anon = anonymizer.anonymize(ArrayDataset(x_train, x_train_predictions))\n",
|
||||
"print(anon)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"2711"
|
||||
]
|
||||
},
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# number of distinct rows in original data\n",
|
||||
"len(np.unique(x_train, axis=0))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"2476"
|
||||
]
|
||||
},
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# number of distinct rows in anonymized data\n",
|
||||
"len(np.unique(anon, axis=0))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train decision tree model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Anonymized model accuracy: 0.8135863890424421\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/abigailt/Library/Python/3.9/lib/python/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
|
||||
" warnings.warn(msg, category=FutureWarning)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"anon_model = DecisionTreeClassifier()\n",
|
||||
"anon_model.fit(anon, y_train)\n",
|
||||
"\n",
|
||||
"anon_art_classifier = ScikitlearnDecisionTreeClassifier(anon_model)\n",
|
||||
"\n",
|
||||
"print('Anonymized model accuracy: ', anon_model.score(x_test, y_test))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
|
|
@ -9,7 +8,6 @@
|
|||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
|
|
@ -23,13 +21,72 @@
|
|||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/abigailt/Library/Python/3.9/lib/python/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[ 39 13 2174 0 40]\n",
|
||||
" [ 50 13 0 0 13]\n",
|
||||
" [ 38 9 0 0 40]\n",
|
||||
" ...\n",
|
||||
" [ 27 13 0 0 40]\n",
|
||||
" [ 26 11 0 0 48]\n",
|
||||
" [ 27 9 0 0 40]]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"sys.path.insert(0, os.path.abspath('..'))\n",
|
||||
"from apt.utils.dataset_utils import get_adult_dataset_pd\n",
|
||||
"\n",
|
||||
"# requires a folder called 'datasets' in the current directory\n",
|
||||
"(x_train, y_train), (x_test, y_test) = get_adult_dataset_pd()\n",
|
||||
"x_train = x_train.to_numpy()\n",
|
||||
"y_train = y_train.to_numpy().astype(int)\n",
|
||||
"x_test = x_test.to_numpy()\n",
|
||||
"y_test = y_test.to_numpy().astype(int)\n",
|
||||
"\n",
|
||||
"# Use only numeric features (age, education-num, capital-gain, capital-loss, hours-per-week)\n",
|
||||
"x_train = x_train[:, [0, 2, 8, 9, 10]].astype(int)\n",
|
||||
"x_test = x_test[:, [0, 2, 8, 9, 10]].astype(int)\n",
|
||||
"\n",
|
||||
"# get balanced dataset\n",
|
||||
"x_train = x_train[:x_test.shape[0]]\n",
|
||||
"y_train = y_train[:y_test.shape[0]]\n",
|
||||
"\n",
|
||||
"print(x_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train decision tree model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
|
|
@ -39,76 +96,14 @@
|
|||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[ 39. 13. 2174. 0. 40.]\n",
|
||||
" [ 50. 13. 0. 0. 13.]\n",
|
||||
" [ 38. 9. 0. 0. 40.]\n",
|
||||
" ...\n",
|
||||
" [ 27. 13. 0. 0. 40.]\n",
|
||||
" [ 26. 11. 0. 0. 48.]\n",
|
||||
" [ 27. 9. 0. 0. 40.]]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"# Use only numeric features (age, education-num, capital-gain, capital-loss, hours-per-week)\n",
|
||||
"x_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
|
||||
" usecols=(0, 4, 10, 11, 12), delimiter=\", \")\n",
|
||||
"\n",
|
||||
"y_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
|
||||
" usecols=14, dtype=str, delimiter=\", \")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"x_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
|
||||
" usecols=(0, 4, 10, 11, 12), delimiter=\", \", skiprows=1)\n",
|
||||
"\n",
|
||||
"y_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
|
||||
" usecols=14, dtype=str, delimiter=\", \", skiprows=1)\n",
|
||||
"\n",
|
||||
"# Trim trailing period \".\" from label\n",
|
||||
"y_test = np.array([a[:-1] for a in y_test])\n",
|
||||
"\n",
|
||||
"y_train[y_train == '<=50K'] = 0\n",
|
||||
"y_train[y_train == '>50K'] = 1\n",
|
||||
"y_train = y_train.astype(int)\n",
|
||||
"\n",
|
||||
"y_test[y_test == '<=50K'] = 0\n",
|
||||
"y_test[y_test == '>50K'] = 1\n",
|
||||
"y_test = y_test.astype(int)\n",
|
||||
"\n",
|
||||
"# get balanced dataset\n",
|
||||
"x_train = x_train[:x_test.shape[0]]\n",
|
||||
"y_train = y_train[:y_test.shape[0]]\n",
|
||||
"\n",
|
||||
"print(x_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train decision tree model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Base model accuracy: 0.8076285240464345\n"
|
||||
"Base model accuracy: 0.8087341072415699\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/mayaa/Development/GitHub/aiprivacy/ai-privacy-toolkit/venv1/lib/python3.8/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
|
||||
"/Users/abigailt/Library/Python/3.9/lib/python/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
|
||||
" warnings.warn(msg, category=FutureWarning)\n"
|
||||
]
|
||||
}
|
||||
|
|
@ -122,13 +117,10 @@
|
|||
"\n",
|
||||
"art_classifier = ScikitlearnDecisionTreeClassifier(model)\n",
|
||||
"\n",
|
||||
"print('Base model accuracy: ', model.score(x_test, y_test))\n",
|
||||
"\n",
|
||||
"x_train_predictions = np.array([np.argmax(arr) for arr in art_classifier.predict(x_train)]).reshape(-1,1)"
|
||||
"print('Base model accuracy: ', model.score(x_test, y_test))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
|
|
@ -139,7 +131,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
|
@ -159,7 +151,6 @@
|
|||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
|
|
@ -168,14 +159,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.5460017196904557\n"
|
||||
"0.5434836015231544\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -191,7 +182,6 @@
|
|||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
|
|
@ -199,7 +189,6 @@
|
|||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
|
|
@ -213,30 +202,29 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[38. 13. 0. 0. 40.]\n",
|
||||
" [46. 13. 0. 0. 35.]\n",
|
||||
" [28. 9. 0. 0. 40.]\n",
|
||||
"[[38 13 0 0 40]\n",
|
||||
" [46 13 0 0 35]\n",
|
||||
" [28 9 0 0 40]\n",
|
||||
" ...\n",
|
||||
" [26. 13. 0. 0. 40.]\n",
|
||||
" [27. 10. 0. 0. 50.]\n",
|
||||
" [28. 9. 0. 0. 40.]]\n"
|
||||
" [26 13 0 0 40]\n",
|
||||
" [27 10 0 0 50]\n",
|
||||
" [28 9 0 0 40]]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"sys.path.insert(0, os.path.abspath('..'))\n",
|
||||
"from apt.utils.datasets import ArrayDataset\n",
|
||||
"from apt.anonymization import Anonymize\n",
|
||||
"\n",
|
||||
"x_train_predictions = np.array([np.argmax(arr) for arr in art_classifier.predict(x_train)])\n",
|
||||
"\n",
|
||||
"# QI = (age, education-num, capital-gain, hours-per-week)\n",
|
||||
"QI = [0, 1, 2, 4]\n",
|
||||
"anonymizer = Anonymize(100, QI)\n",
|
||||
|
|
@ -246,7 +234,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -255,7 +243,7 @@
|
|||
"6739"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
|
@ -267,7 +255,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -276,7 +264,7 @@
|
|||
"401"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
|
@ -287,7 +275,6 @@
|
|||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
|
|
@ -296,21 +283,21 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Anonymized model accuracy: 0.826914808672686\n"
|
||||
"Anonymized model accuracy: 0.8308457711442786\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/mayaa/Development/GitHub/aiprivacy/ai-privacy-toolkit/venv1/lib/python3.8/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
|
||||
"/Users/abigailt/Library/Python/3.9/lib/python/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
|
||||
" warnings.warn(msg, category=FutureWarning)\n"
|
||||
]
|
||||
}
|
||||
|
|
@ -325,7 +312,6 @@
|
|||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
|
|
@ -335,14 +321,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.49692912418621793\n"
|
||||
"0.4944724235351923\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -364,7 +350,6 @@
|
|||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
|
|
@ -380,8 +365,8 @@
|
|||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(0.5316007088009451, 0.7738607050730868)\n",
|
||||
"(0.4971184877823882, 0.5297874953936863)\n"
|
||||
"without anonymization: (0.5303914835164835, 0.7588748311018303)\n",
|
||||
"with anonymization: (0.49255952380952384, 0.3659255619702739)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -411,15 +396,14 @@
|
|||
" return precision, recall\n",
|
||||
"\n",
|
||||
"# regular\n",
|
||||
"print(calc_precision_recall(np.concatenate((inferred_train_bb, inferred_test_bb)), \n",
|
||||
"print('without anonymization:', calc_precision_recall(np.concatenate((inferred_train_bb, inferred_test_bb)), \n",
|
||||
" np.concatenate((np.ones(len(inferred_train_bb)), np.zeros(len(inferred_test_bb))))))\n",
|
||||
"# anon\n",
|
||||
"print(calc_precision_recall(np.concatenate((anon_inferred_train_bb, anon_inferred_test_bb)), \n",
|
||||
"print('with anonymization:', calc_precision_recall(np.concatenate((anon_inferred_train_bb, anon_inferred_test_bb)), \n",
|
||||
" np.concatenate((np.ones(len(anon_inferred_train_bb)), np.zeros(len(anon_inferred_test_bb))))))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
|
|
@ -429,7 +413,7 @@
|
|||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
|
|
@ -443,7 +427,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.9.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import pytest
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.pipeline import Pipeline
|
||||
|
|
@ -118,6 +119,74 @@ def test_regression():
|
|||
assert ((np.delete(anon, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
|
||||
|
||||
|
||||
def test_anonymize_ndarray_one_hot():
|
||||
x_train = np.array([[23, 0, 1, 165],
|
||||
[45, 0, 1, 158],
|
||||
[56, 1, 0, 123],
|
||||
[67, 0, 1, 154],
|
||||
[45, 1, 0, 149],
|
||||
[42, 1, 0, 166],
|
||||
[73, 0, 1, 172],
|
||||
[94, 0, 1, 168],
|
||||
[69, 0, 1, 175],
|
||||
[24, 1, 0, 181],
|
||||
[18, 1, 0, 190]])
|
||||
y_train = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
|
||||
|
||||
model = DecisionTreeClassifier()
|
||||
model.fit(x_train, y_train)
|
||||
pred = model.predict(x_train)
|
||||
|
||||
k = 10
|
||||
QI = [0, 1, 2]
|
||||
QI_slices = [[1, 2]]
|
||||
anonymizer = Anonymize(k, QI, train_only_QI=True, quasi_identifer_slices=QI_slices)
|
||||
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
|
||||
assert (len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0)))
|
||||
_, counts_elements = np.unique(anon[:, QI], return_counts=True)
|
||||
assert (np.min(counts_elements) >= k)
|
||||
assert ((np.delete(anon, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
|
||||
anonymized_slice = anon[:, QI_slices[0]]
|
||||
assert ((np.sum(anonymized_slice, axis=1) == 1).all())
|
||||
assert ((np.max(anonymized_slice, axis=1) == 1).all())
|
||||
assert ((np.min(anonymized_slice, axis=1) == 0).all())
|
||||
|
||||
|
||||
def test_anonymize_pandas_one_hot():
|
||||
feature_names = ["age", "gender_M", "gender_F", "height"]
|
||||
x_train = np.array([[23, 0, 1, 165],
|
||||
[45, 0, 1, 158],
|
||||
[56, 1, 0, 123],
|
||||
[67, 0, 1, 154],
|
||||
[45, 1, 0, 149],
|
||||
[42, 1, 0, 166],
|
||||
[73, 0, 1, 172],
|
||||
[94, 0, 1, 168],
|
||||
[69, 0, 1, 175],
|
||||
[24, 1, 0, 181],
|
||||
[18, 1, 0, 190]])
|
||||
y_train = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
|
||||
x_train = pd.DataFrame(x_train, columns=feature_names)
|
||||
y_train = pd.Series(y_train)
|
||||
|
||||
model = DecisionTreeClassifier()
|
||||
model.fit(x_train, y_train)
|
||||
pred = model.predict(x_train)
|
||||
|
||||
k = 10
|
||||
QI = ["age", "gender_M", "gender_F"]
|
||||
QI_slices = [["gender_M", "gender_F"]]
|
||||
anonymizer = Anonymize(k, QI, train_only_QI=True, quasi_identifer_slices=QI_slices)
|
||||
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
|
||||
assert (anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
|
||||
assert (anon.loc[:, QI].value_counts().min() >= k)
|
||||
np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
|
||||
anonymized_slice = anon.loc[:, QI_slices[0]]
|
||||
assert ((np.sum(anonymized_slice, axis=1) == 1).all())
|
||||
assert ((np.max(anonymized_slice, axis=1) == 1).all())
|
||||
assert ((np.min(anonymized_slice, axis=1) == 0).all())
|
||||
|
||||
|
||||
def test_errors():
|
||||
with pytest.raises(ValueError):
|
||||
Anonymize(1, [0, 2])
|
||||
|
|
|
|||
|
|
@ -11,6 +11,8 @@ from sklearn.model_selection import train_test_split
|
|||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
|
||||
from torch import nn, optim
|
||||
|
||||
import tensorflow as tf
|
||||
from tensorflow.keras.models import Sequential
|
||||
from tensorflow.keras.layers import Dense, Input
|
||||
|
|
@ -24,6 +26,9 @@ from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegresso
|
|||
tf.compat.v1.disable_eager_execution()
|
||||
|
||||
|
||||
ACCURACY_DIFF = 0.05
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def diabetes_dataset():
|
||||
return load_diabetes()
|
||||
|
|
@ -286,7 +291,7 @@ def test_minimizer_fit(data_two_features):
|
|||
check_ncp(ncp, expected_generalizations)
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
|
||||
|
||||
def test_minimizer_ncp(data_two_features):
|
||||
|
|
@ -348,12 +353,15 @@ def test_minimizer_ncp_categorical(data_four_features):
|
|||
train_dataset = ArrayDataset(x, predictions, features_names=features)
|
||||
|
||||
gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
|
||||
categorical_features=categorical_features, generalize_using_transform=False)
|
||||
categorical_features=categorical_features,
|
||||
generalize_using_transform=False,
|
||||
encoder=preprocessor)
|
||||
gen1.fit(dataset=train_dataset)
|
||||
ncp1 = gen1.ncp.fit_score
|
||||
ncp2 = gen1.calculate_ncp(ad1)
|
||||
|
||||
gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, categorical_features=categorical_features)
|
||||
gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, categorical_features=categorical_features,
|
||||
encoder=preprocessor)
|
||||
gen2.fit(dataset=train_dataset)
|
||||
ncp3 = gen2.ncp.fit_score
|
||||
gen2.transform(dataset=ad1)
|
||||
|
|
@ -414,7 +422,8 @@ def test_minimizer_fit_pandas(data_four_features):
|
|||
# Now we have a full prediction pipeline.
|
||||
target_accuracy = 0.5
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
|
||||
categorical_features=categorical_features)
|
||||
categorical_features=categorical_features,
|
||||
encoder=preprocessor)
|
||||
train_dataset = ArrayDataset(x, predictions)
|
||||
gen.fit(dataset=train_dataset)
|
||||
transformed = gen.transform(dataset=ArrayDataset(x))
|
||||
|
|
@ -428,7 +437,7 @@ def test_minimizer_fit_pandas(data_four_features):
|
|||
check_ncp(ncp, expected_generalizations)
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
|
||||
|
||||
def test_minimizer_params_categorical(cells_categorical):
|
||||
|
|
@ -450,13 +459,14 @@ def test_minimizer_params_categorical(cells_categorical):
|
|||
# Now we have a full prediction pipeline.
|
||||
target_accuracy = 0.5
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
|
||||
categorical_features=categorical_features, cells=cells)
|
||||
categorical_features=categorical_features, cells=cells,
|
||||
encoder=preprocessor)
|
||||
train_dataset = ArrayDataset(x, predictions)
|
||||
gen.fit(dataset=train_dataset)
|
||||
transformed = gen.transform(dataset=ArrayDataset(x))
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
|
||||
|
||||
def test_minimizer_fit_qi(data_three_features):
|
||||
|
|
@ -484,7 +494,7 @@ def test_minimizer_fit_qi(data_three_features):
|
|||
check_ncp(ncp, expected_generalizations)
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
|
||||
|
||||
def test_minimizer_fit_pandas_qi(data_five_features):
|
||||
|
|
@ -508,7 +518,8 @@ def test_minimizer_fit_pandas_qi(data_five_features):
|
|||
# Now we have a full prediction pipeline.
|
||||
target_accuracy = 0.5
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
|
||||
categorical_features=categorical_features, features_to_minimize=qi)
|
||||
categorical_features=categorical_features, features_to_minimize=qi,
|
||||
encoder=preprocessor)
|
||||
train_dataset = ArrayDataset(x, predictions)
|
||||
gen.fit(dataset=train_dataset)
|
||||
transformed = gen.transform(dataset=ArrayDataset(x))
|
||||
|
|
@ -523,7 +534,7 @@ def test_minimizer_fit_pandas_qi(data_five_features):
|
|||
check_ncp(ncp, expected_generalizations)
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
|
||||
|
||||
def test_minimize_ndarray_iris():
|
||||
|
|
@ -552,7 +563,7 @@ def test_minimize_ndarray_iris():
|
|||
check_ncp(ncp, expected_generalizations)
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
|
||||
|
||||
def test_minimize_pandas_adult():
|
||||
|
|
@ -582,7 +593,8 @@ def test_minimize_pandas_adult():
|
|||
predictions = np.argmax(predictions, axis=1)
|
||||
target_accuracy = 0.7
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
|
||||
categorical_features=categorical_features, features_to_minimize=qi)
|
||||
categorical_features=categorical_features, features_to_minimize=qi,
|
||||
encoder=preprocessor)
|
||||
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
|
||||
transformed = gen.transform(dataset=ArrayDataset(x_train))
|
||||
gener = gen.generalizations
|
||||
|
|
@ -609,7 +621,7 @@ def test_minimize_pandas_adult():
|
|||
check_ncp(ncp, expected_generalizations)
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
|
||||
|
||||
def test_german_credit_pandas():
|
||||
|
|
@ -637,7 +649,8 @@ def test_german_credit_pandas():
|
|||
predictions = np.argmax(predictions, axis=1)
|
||||
target_accuracy = 0.7
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
|
||||
categorical_features=categorical_features, features_to_minimize=qi)
|
||||
categorical_features=categorical_features, features_to_minimize=qi,
|
||||
encoder=preprocessor)
|
||||
gen.fit(dataset=ArrayDataset(x_train, predictions))
|
||||
transformed = gen.transform(dataset=ArrayDataset(x_train))
|
||||
gener = gen.generalizations
|
||||
|
|
@ -666,7 +679,7 @@ def test_german_credit_pandas():
|
|||
check_ncp(ncp, expected_generalizations)
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
|
||||
|
||||
def test_regression(diabetes_dataset):
|
||||
|
|
@ -726,7 +739,7 @@ def test_regression(diabetes_dataset):
|
|||
check_ncp(ncp, expected_generalizations)
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
|
||||
|
||||
def test_x_y():
|
||||
|
|
@ -766,7 +779,7 @@ def test_x_y():
|
|||
check_ncp(ncp, expected_generalizations)
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
|
||||
|
||||
def test_x_y_features_names():
|
||||
|
|
@ -806,7 +819,7 @@ def test_x_y_features_names():
|
|||
check_ncp(ncp, expected_generalizations)
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
|
||||
|
||||
def test_BaseEstimator_classification(data_five_features):
|
||||
|
|
@ -828,7 +841,8 @@ def test_BaseEstimator_classification(data_five_features):
|
|||
# Now we have a full prediction pipeline.
|
||||
target_accuracy = 0.5
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
|
||||
categorical_features=categorical_features, features_to_minimize=QI)
|
||||
categorical_features=categorical_features, features_to_minimize=QI,
|
||||
encoder=preprocessor)
|
||||
train_dataset = ArrayDataset(x, predictions)
|
||||
gen.fit(dataset=train_dataset)
|
||||
transformed = gen.transform(dataset=ArrayDataset(x))
|
||||
|
|
@ -844,7 +858,7 @@ def test_BaseEstimator_classification(data_five_features):
|
|||
check_ncp(ncp, expected_generalizations)
|
||||
|
||||
rel_accuracy = model.score(preprocessor.transform(transformed), predictions)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
|
||||
|
||||
def test_BaseEstimator_regression(diabetes_dataset):
|
||||
|
|
@ -903,7 +917,7 @@ def test_BaseEstimator_regression(diabetes_dataset):
|
|||
check_ncp(ncp, expected_generalizations)
|
||||
|
||||
rel_accuracy = model.score(transformed, predictions)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
|
||||
|
||||
def test_keras_model():
|
||||
|
|
@ -936,7 +950,39 @@ def test_keras_model():
|
|||
check_ncp(ncp, gener)
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
|
||||
|
||||
class PytorchModel(nn.Module):
|
||||
|
||||
def __init__(self, num_classes, num_features):
|
||||
super(PytorchModel, self).__init__()
|
||||
|
||||
self.fc1 = nn.Sequential(
|
||||
nn.Linear(num_features, 1024),
|
||||
nn.Tanh(), )
|
||||
|
||||
self.fc2 = nn.Sequential(
|
||||
nn.Linear(1024, 512),
|
||||
nn.Tanh(), )
|
||||
|
||||
self.fc3 = nn.Sequential(
|
||||
nn.Linear(512, 256),
|
||||
nn.Tanh(), )
|
||||
|
||||
self.fc4 = nn.Sequential(
|
||||
nn.Linear(256, 128),
|
||||
nn.Tanh(),
|
||||
)
|
||||
|
||||
self.classifier = nn.Linear(128, num_classes)
|
||||
|
||||
def forward(self, x):
|
||||
out = self.fc1(x)
|
||||
out = self.fc2(out)
|
||||
out = self.fc3(out)
|
||||
out = self.fc4(out)
|
||||
return self.classifier(out)
|
||||
|
||||
|
||||
def test_minimizer_pytorch(data_three_features):
|
||||
|
|
@ -944,49 +990,17 @@ def test_minimizer_pytorch(data_three_features):
|
|||
x = x.astype(np.float32)
|
||||
qi = ['age', 'weight']
|
||||
|
||||
from torch import nn, optim
|
||||
from apt.utils.datasets.datasets import PytorchData
|
||||
from apt.utils.models.pytorch_model import PyTorchClassifier
|
||||
|
||||
class pytorch_model(nn.Module):
|
||||
|
||||
def __init__(self, num_classes, num_features):
|
||||
super(pytorch_model, self).__init__()
|
||||
|
||||
self.fc1 = nn.Sequential(
|
||||
nn.Linear(num_features, 1024),
|
||||
nn.Tanh(), )
|
||||
|
||||
self.fc2 = nn.Sequential(
|
||||
nn.Linear(1024, 512),
|
||||
nn.Tanh(), )
|
||||
|
||||
self.fc3 = nn.Sequential(
|
||||
nn.Linear(512, 256),
|
||||
nn.Tanh(), )
|
||||
|
||||
self.fc4 = nn.Sequential(
|
||||
nn.Linear(256, 128),
|
||||
nn.Tanh(),
|
||||
)
|
||||
|
||||
self.classifier = nn.Linear(128, num_classes)
|
||||
|
||||
def forward(self, x):
|
||||
out = self.fc1(x)
|
||||
out = self.fc2(out)
|
||||
out = self.fc3(out)
|
||||
out = self.fc4(out)
|
||||
return self.classifier(out)
|
||||
|
||||
base_est = pytorch_model(2, 3)
|
||||
base_est = PytorchModel(2, 3)
|
||||
criterion = nn.CrossEntropyLoss()
|
||||
optimizer = optim.Adam(base_est.parameters(), lr=0.01)
|
||||
|
||||
model = PyTorchClassifier(model=base_est, output_type=ModelOutputType.CLASSIFIER_LOGITS, loss=criterion,
|
||||
optimizer=optimizer, input_shape=(3,),
|
||||
nb_classes=2)
|
||||
model.fit(PytorchData(x.astype(np.float32), y), save_entire_model=False, nb_epochs=10)
|
||||
model.fit(PytorchData(x, y), save_entire_model=False, nb_epochs=10)
|
||||
|
||||
ad = ArrayDataset(x)
|
||||
predictions = model.predict(ad)
|
||||
|
|
@ -1006,7 +1020,41 @@ def test_minimizer_pytorch(data_three_features):
|
|||
check_ncp(ncp, expected_generalizations)
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed.astype(np.float32), predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
|
||||
|
||||
def test_minimizer_pytorch_iris():
|
||||
features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
|
||||
(x_train, y_train), _ = get_iris_dataset_np()
|
||||
x_train = x_train.astype(np.float32)
|
||||
qi = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
|
||||
|
||||
from apt.utils.datasets.datasets import PytorchData
|
||||
from apt.utils.models.pytorch_model import PyTorchClassifier
|
||||
|
||||
base_est = PytorchModel(3, 4)
|
||||
criterion = nn.CrossEntropyLoss()
|
||||
optimizer = optim.Adam(base_est.parameters(), lr=0.01)
|
||||
|
||||
model = PyTorchClassifier(model=base_est, output_type=ModelOutputType.CLASSIFIER_LOGITS, loss=criterion,
|
||||
optimizer=optimizer, input_shape=(4,),
|
||||
nb_classes=3)
|
||||
model.fit(PytorchData(x_train, y_train), save_entire_model=False, nb_epochs=10)
|
||||
|
||||
predictions = model.predict(ArrayDataset(x_train))
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
target_accuracy = 0.99
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=qi)
|
||||
transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features))
|
||||
gener = gen.generalizations
|
||||
|
||||
check_features(features, gener, transformed, x_train)
|
||||
ncp = gen.ncp.transform_score
|
||||
check_ncp(ncp, gener)
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed.astype(np.float32), predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
|
||||
|
||||
|
||||
def test_untouched():
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue