Support 1-hot encoded features in anonymization + fixes related to encoding in minimization (#86)

* Support 1-hot encoded features in anonymization (#72)
* Fix anonymization adult notebook + new notebook to demonstrate anonymization on 1-hot encoded data

* Minimizer: No default encoder, if none provided data is supplied to the model as is. Fix data type of representative values. Fix and add more tests.

Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
abigailgold 2023-10-19 11:48:15 +03:00 committed by GitHub
parent 26addd192f
commit 5dce961092
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 670 additions and 255 deletions

View file

@ -23,7 +23,11 @@ class Anonymize:
:type k: int
:param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
in case of numpy data.
:type quasi_identifiers: np.ndarray or list
:type quasi_identifiers: np.ndarray or list of strings or integers.
:param quasi_identifer_slices: If some of the quasi-identifiers represent 1-hot encoded features that need to remain
consistent after anonymization, provide a list containing the list of column names
or indexes that represent a single feature.
:type quasi_identifer_slices: list of lists of strings or integers.
:param categorical_features: The list of categorical features (if supplied, these features will be one-hot encoded
before using them to train the decision tree model).
:type categorical_features: list, optional
@ -35,8 +39,12 @@ class Anonymize:
:type train_only_QI: boolean, optional
"""
def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
is_regression: Optional[bool] = False, train_only_QI: Optional[bool] = False):
def __init__(self, k: int,
quasi_identifiers: Union[np.ndarray, list],
quasi_identifer_slices: Optional[list] = None,
categorical_features: Optional[list] = None,
is_regression: Optional[bool] = False,
train_only_QI: Optional[bool] = False):
if k < 2:
raise ValueError("k should be a positive integer with a value of 2 or higher")
if quasi_identifiers is None or len(quasi_identifiers) < 1:
@ -49,6 +57,7 @@ class Anonymize:
self.train_only_QI = train_only_QI
self.features_names = None
self.features = None
self.quasi_identifer_slices = quasi_identifer_slices
def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
"""
@ -76,7 +85,14 @@ class Anonymize:
if self.categorical_features and not set(self.categorical_features).issubset(set(self.features_names)):
raise ValueError('Categorical features should bs a subset of the supplied features or indexes in range of '
'the data columns')
# transform quasi identifiers to indexes
self.quasi_identifiers = [i for i, v in enumerate(self.features_names) if v in self.quasi_identifiers]
if self.quasi_identifer_slices:
temp_list = []
for slice in self.quasi_identifer_slices:
new_slice = [i for i, v in enumerate(self.features_names) if v in slice]
temp_list.append(new_slice)
self.quasi_identifer_slices = temp_list
if self.categorical_features:
self.categorical_features = [i for i, v in enumerate(self.features_names) if v in self.categorical_features]
@ -126,31 +142,49 @@ class Anonymize:
return cells_by_id
def _find_representatives(self, x, x_anonymizer_train, cells):
# x is original data, x_anonymizer_train is only QIs + 1-hot encoded
# x is original data (always numpy), x_anonymizer_train is only QIs + 1-hot encoded
node_ids = self._find_sample_nodes(x_anonymizer_train)
if self.quasi_identifer_slices:
all_one_hot_features = set([feature for encoded in self.quasi_identifer_slices for feature in encoded])
else:
all_one_hot_features = set()
for cell in cells:
cell['representative'] = {}
# get all rows in cell
indexes = [index for index, node_id in enumerate(node_ids) if node_id == cell['id']]
# TODO: should we filter only those with majority label? (using hist)
rows = x[indexes]
for feature in self.quasi_identifiers:
values = rows[:, feature]
if self.categorical_features and feature in self.categorical_features:
# find most common value
cell['representative'][feature] = Counter(values).most_common(1)[0][0]
else:
# find the mean value (per feature)
median = np.median(values)
min_value = max(values)
min_dist = float("inf")
for value in values:
# euclidean distance between two floating point values
dist = abs(value - median)
if dist < min_dist:
min_dist = dist
min_value = value
cell['representative'][feature] = min_value
done = set()
for feature in self.quasi_identifiers: # self.quasi_identifiers are numerical indexes
if feature not in done:
# deal with 1-hot encoded features
if feature in all_one_hot_features:
# find features that belong together
for encoded in self.quasi_identifer_slices:
if feature in encoded:
values = rows[:, encoded]
unique_rows, counts = np.unique(values, axis=0, return_counts=True)
rep = unique_rows[np.argmax(counts)]
for i, e in enumerate(encoded):
done.add(e)
cell['representative'][e] = rep[i]
else: # rest of features
values = rows[:, feature]
if self.categorical_features and feature in self.categorical_features:
# find most common value
cell['representative'][feature] = Counter(values).most_common(1)[0][0]
else:
# find the mean value (per feature)
median = np.median(values)
min_value = max(values)
min_dist = float("inf")
for value in values:
# euclidean distance between two floating point values
dist = abs(value - median)
if dist < min_dist:
min_dist = dist
min_value = value
cell['representative'][feature] = min_value
def _find_sample_nodes(self, samples):
paths = self._anonymizer.decision_path(samples).toarray()

View file

@ -10,9 +10,6 @@ import copy
import sys
from scipy.spatial import distance
from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.utils.validation import check_is_fitted
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
@ -57,7 +54,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
:param categorical_features: The list of categorical features (if supplied, these features will be one-hot
encoded before using them to train the decision tree model).
:param encoder: Optional encoder for encoding data before feeding it into the estimator (e.g., for categorical
features)
features). If not provided, the data will be fed as is directly to the estimator.
:type encoder: sklearn OrdinalEncoder or OneHotEncoder
:type categorical_features: list of strings, optional
:param features_to_minimize: The features to be minimized.
@ -256,7 +253,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# Going to fit
# (currently not dealing with option to fit with only X and y and no estimator)
if self.estimator and dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
dtype = dataset.get_samples().dtype
x = pd.DataFrame(dataset.get_samples(), columns=self._features)
if not self.features_to_minimize:
self.features_to_minimize = self._features
@ -293,21 +289,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# collect feature data (such as min, max)
self._feature_data = self._get_feature_data(x)
# default encoder in case none provided
if self.encoder is None:
numeric_features = [f for f in self._features if f not in self.categorical_features]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
self.encoder = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, self.categorical_features),
]
)
self.encoder.fit(x)
self.cells = []
self._categorical_values = {}
@ -334,14 +315,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
# self._cells currently holds the generalization created from the tree leaves
self._calculate_generalizations(x_test)
if self.generalize_using_transform:
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
else:
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
generalized = self._generalize(x_test, x_prepared_test, nodes)
# check accuracy
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype), y_test))
accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
print('Initial accuracy of model on generalized data, relative to original model predictions '
'(base generalization derived from tree, before improvements): %f' % accuracy)
@ -364,15 +341,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
self._calculate_generalizations(x_test)
if self.generalize_using_transform:
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
self._cells_by_id)
else:
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype),
y_test))
generalized = self._generalize(x_test, x_prepared_test, nodes)
accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
# if accuracy passed threshold roll back to previous iteration generalizations
if accuracy < self.target_accuracy:
self.cells = cells_previous_iter
@ -395,14 +365,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
if removed_feature is None:
break
self._calculate_generalizations(x_test)
if self.generalize_using_transform:
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
self._cells_by_id)
else:
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized).astype(dtype),
y_test))
generalized = self._generalize(x_test, x_prepared_test, nodes)
accuracy = self._calculate_accuracy(generalized, y_test, self.estimator, self.encoder)
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
# self._cells currently holds the chosen generalization based on target accuracy
@ -893,7 +857,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def _generalize_indexes(self, original_data, cells, all_indexes):
# prepared data include one hot encoded categorical data + QI
representatives = pd.DataFrame(columns=self._features) # empty except for columns
dtypes = original_data.dtypes.to_dict()
new_dtypes = {}
for t in dtypes.keys():
new_dtypes[t] = pd.Series(dtype=dtypes[t].name)
representatives = pd.DataFrame(new_dtypes) # empty except for columns
original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)
# iterate over cells (leaves in decision tree)
@ -925,6 +893,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return original_data_generalized
def _generalize(self, data, data_prepared, nodes):
    """Generalize *data* according to the current cells/generalizations.

    Recomputes the generalizations for the given samples, then either
    applies the decision-tree based transformation (when
    ``generalize_using_transform`` is set) or maps samples directly from
    the computed generalizations.

    :param data: Original samples to generalize.
    :param data_prepared: The prepared (QI-only / encoded) counterpart of
        ``data``, used when generalizing from the tree.
    :param nodes: Tree node ids for the prepared samples.
    :return: The generalized samples.
    """
    self._calculate_generalizations(data)
    if self.generalize_using_transform:
        generalized = self._generalize_from_tree(data, data_prepared, nodes, self.cells,
                                                 self._cells_by_id)
    else:
        generalized = self._generalize_from_generalizations(data, self.generalizations)
    return generalized
@staticmethod
def _map_to_ranges_categories(samples, ranges, categories):
all_sample_indexes = []
@ -994,18 +971,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
feature_data[feature],
total)
if feature_ncp > 0:
# divide by accuracy gain
new_cells = copy.deepcopy(self.cells)
cells_by_id = copy.deepcopy(self._cells_by_id)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
cells_by_id)
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
labels)) - current_accuracy
if accuracy_gain < 0:
accuracy_gain = 0
if accuracy_gain != 0:
feature_ncp = feature_ncp / accuracy_gain
feature_ncp = self._normalize_ncp_by_accuracy_gain(original_data, prepared_data, nodes, feature,
feature_ncp, labels, current_accuracy)
if feature_ncp < range_min:
range_min = feature_ncp
@ -1021,19 +988,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
feature_data[feature],
total)
if feature_ncp > 0:
# divide by accuracy loss
new_cells = copy.deepcopy(self.cells)
cells_by_id = copy.deepcopy(self._cells_by_id)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
cells_by_id)
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
labels)) - current_accuracy
feature_ncp = self._normalize_ncp_by_accuracy_gain(original_data, prepared_data, nodes, feature,
feature_ncp, labels, current_accuracy)
if accuracy_gain < 0:
accuracy_gain = 0
if accuracy_gain != 0:
feature_ncp = feature_ncp / accuracy_gain
if feature_ncp < range_min:
range_min = feature_ncp
remove_feature = feature
@ -1063,6 +1020,21 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
feature_ncp += cell_ncp
return feature_ncp
def _normalize_ncp_by_accuracy_gain(self, original_data, prepared_data, nodes, feature, feature_ncp, labels,
                                    current_accuracy):
    """Normalize a feature's NCP score by the accuracy gained from removing it.

    Works on deep copies of the current cells so the candidate removal does
    not mutate ``self.cells`` / ``self._cells_by_id``. The feature is removed
    from the copied cells, the data is re-generalized from the tree, and the
    resulting accuracy is compared to ``current_accuracy``.

    :param feature: The feature whose removal is being evaluated.
    :param feature_ncp: The feature's information-loss (NCP) score.
    :return: ``feature_ncp`` divided by the (positive) accuracy gain, or the
        original ``feature_ncp`` when there is no positive gain.
    """
    new_cells = copy.deepcopy(self.cells)
    cells_by_id = copy.deepcopy(self._cells_by_id)
    GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
    generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
                                             cells_by_id)
    accuracy = self._calculate_accuracy(generalized, labels, self.estimator, self.encoder)
    accuracy_gain = accuracy - current_accuracy
    # A loss in accuracy is treated as zero gain; only a strictly positive
    # gain is used to scale down the NCP score.
    if accuracy_gain < 0:
        accuracy_gain = 0
    if accuracy_gain != 0:
        feature_ncp = feature_ncp / accuracy_gain
    return feature_ncp
def _calculate_generalizations(self, samples: Optional[pd.DataFrame] = None):
ranges, range_representatives = self._calculate_ranges(self.cells)
categories, category_representatives = self._calculate_categories(self.cells)
@ -1282,3 +1254,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
for feature in to_remove:
del generalizations['categories'][feature]
@staticmethod
def _calculate_accuracy(generalized, y_test, estimator, encoder):
    """Score *estimator* on generalized data, encoding it first if needed.

    If an encoder was provided it is applied before scoring; otherwise the
    generalized data is fed to the estimator as is (matching the new
    "no default encoder" behavior of the minimizer).
    """
    generalized_data = encoder.transform(generalized) if encoder else generalized
    return estimator.score(ArrayDataset(generalized_data, y_test))

View file

@ -368,7 +368,7 @@ class PyTorchClassifier(PyTorchModel):
if validation_data is None:
self._art_model.fit(
x=train_data.get_samples(),
y=train_data.get_labels().reshape(-1, 1),
y=train_data.get_labels(),
batch_size=batch_size,
nb_epochs=nb_epochs,
save_checkpoints=save_checkpoints,
@ -379,9 +379,9 @@ class PyTorchClassifier(PyTorchModel):
else:
self._art_model.fit(
x=train_data.get_samples(),
y=train_data.get_labels().reshape(-1, 1),
y=train_data.get_labels(),
x_validation=validation_data.get_samples(),
y_validation=validation_data.get_labels().reshape(-1, 1),
y_validation=validation_data.get_labels(),
batch_size=batch_size,
nb_epochs=nb_epochs,
save_checkpoints=save_checkpoints,

View file

@ -0,0 +1,303 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Using ML anonymization on one-hot encoded data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this tutorial we will show how to anonymize models using the ML anonymization module, specifically when the inout data is already one-hot encoded. \n",
"\n",
"This will be demonstarted using the Adult dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/datasets/adult). "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load data"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['State-gov' 'Never-married' 'Adm-clerical' ... 'White' 'Male'\n",
" 'UnitedStates']\n",
" ['Self-emp-not-inc' 'Married-civ-spouse' 'Exec-managerial' ... 'White'\n",
" 'Male' 'UnitedStates']\n",
" ['Private' 'Divorced' 'Handlers-cleaners' ... 'White' 'Male'\n",
" 'UnitedStates']\n",
" ...\n",
" ['Private' 'Never-married' 'Sales' ... 'White' 'Female' 'UnitedStates']\n",
" ['Private' 'Never-married' 'Craft-repair' ... 'White' 'Male'\n",
" 'UnitedStates']\n",
" ['Private' 'Never-married' 'Handlers-cleaners' ... 'White' 'Male'\n",
" 'UnitedStates']]\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"import os\n",
"import sys\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
"from apt.utils.dataset_utils import get_adult_dataset_pd\n",
"\n",
"# 'workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'\n",
"categorical_features = [1, 3, 4, 5, 6, 7, 11]\n",
"\n",
"# requires a folder called 'datasets' in the current directory\n",
"(x_train, y_train), (x_test, y_test) = get_adult_dataset_pd()\n",
"x_train = x_train.to_numpy()[:, [1, 3, 4, 5, 6, 7, 11]]\n",
"y_train = y_train.to_numpy().astype(int)\n",
"x_test = x_test.to_numpy()[:, [1, 3, 4, 5, 6, 7, 11]]\n",
"y_test = y_test.to_numpy().astype(int)\n",
"\n",
"# get balanced dataset\n",
"x_train = x_train[:x_test.shape[0]]\n",
"y_train = y_train[:y_test.shape[0]]\n",
"\n",
"print(x_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Encode data"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0 0 0 ... 0 1 0]\n",
" [0 0 0 ... 0 1 0]\n",
" [0 0 0 ... 0 1 0]\n",
" ...\n",
" [0 0 0 ... 0 1 0]\n",
" [0 0 0 ... 0 1 0]\n",
" [0 0 0 ... 0 1 0]]\n"
]
}
],
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"import scipy\n",
"\n",
"preprocessor = OneHotEncoder(handle_unknown=\"ignore\")\n",
"\n",
"x_train = preprocessor.fit_transform(x_train)\n",
"x_test = preprocessor.transform(x_test)\n",
"if scipy.sparse.issparse(x_train):\n",
" x_train = x_train.toarray().astype(int)\n",
"if scipy.sparse.issparse(x_test):\n",
" x_test = x_test.toarray().astype(int)\n",
"\n",
"print(x_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train decision tree model"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Base model accuracy: 0.814446287083103\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/abigailt/Library/Python/3.9/lib/python/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
" warnings.warn(msg, category=FutureWarning)\n"
]
}
],
"source": [
"from sklearn.tree import DecisionTreeClassifier\n",
"from art.estimators.classification.scikitlearn import ScikitlearnDecisionTreeClassifier\n",
"\n",
"model = DecisionTreeClassifier()\n",
"model.fit(x_train, y_train)\n",
"\n",
"art_classifier = ScikitlearnDecisionTreeClassifier(model)\n",
"\n",
"print('Base model accuracy: ', model.score(x_test, y_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Anonymize data\n",
"## k=100\n",
"\n",
"The data is anonymized on the quasi-identifiers: age, education-num, capital-gain, hours-per-week and with a privact parameter k=100.\n",
"\n",
"This means that each record in the anonymized dataset is identical to 99 others on the quasi-identifier values (i.e., when looking only at those features, the records are indistinguishable)."
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0 0 0 ... 0 1 0]\n",
" [0 0 0 ... 0 1 0]\n",
" [0 0 0 ... 0 1 0]\n",
" ...\n",
" [0 0 0 ... 0 1 0]\n",
" [0 0 0 ... 0 1 0]\n",
" [0 0 0 ... 0 1 0]]\n"
]
}
],
"source": [
"from apt.utils.datasets import ArrayDataset\n",
"from apt.anonymization import Anonymize\n",
"\n",
"x_train_predictions = np.array([np.argmax(arr) for arr in art_classifier.predict(x_train)])\n",
"\n",
"# QI = (race, sex)\n",
"QI = [53, 52, 51, 50, 49, 48, 47]\n",
"QI_slices = [[47, 48, 49, 50, 51], [52, 53]]\n",
"anonymizer = Anonymize(100, QI)\n",
"anon = anonymizer.anonymize(ArrayDataset(x_train, x_train_predictions))\n",
"print(anon)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2711"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# number of distinct rows in original data\n",
"len(np.unique(x_train, axis=0))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2476"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# number of distinct rows in anonymized data\n",
"len(np.unique(anon, axis=0))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train decision tree model"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Anonymized model accuracy: 0.8135863890424421\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/abigailt/Library/Python/3.9/lib/python/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
" warnings.warn(msg, category=FutureWarning)\n"
]
}
],
"source": [
"anon_model = DecisionTreeClassifier()\n",
"anon_model.fit(anon, y_train)\n",
"\n",
"anon_art_classifier = ScikitlearnDecisionTreeClassifier(anon_model)\n",
"\n",
"print('Anonymized model accuracy: ', anon_model.score(x_test, y_test))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -1,7 +1,6 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -9,7 +8,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -23,13 +21,72 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/abigailt/Library/Python/3.9/lib/python/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 39 13 2174 0 40]\n",
" [ 50 13 0 0 13]\n",
" [ 38 9 0 0 40]\n",
" ...\n",
" [ 27 13 0 0 40]\n",
" [ 26 11 0 0 48]\n",
" [ 27 9 0 0 40]]\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"import os\n",
"import sys\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
"from apt.utils.dataset_utils import get_adult_dataset_pd\n",
"\n",
"# requires a folder called 'datasets' in the current directory\n",
"(x_train, y_train), (x_test, y_test) = get_adult_dataset_pd()\n",
"x_train = x_train.to_numpy()\n",
"y_train = y_train.to_numpy().astype(int)\n",
"x_test = x_test.to_numpy()\n",
"y_test = y_test.to_numpy().astype(int)\n",
"\n",
"# Use only numeric features (age, education-num, capital-gain, capital-loss, hours-per-week)\n",
"x_train = x_train[:, [0, 2, 8, 9, 10]].astype(int)\n",
"x_test = x_test[:, [0, 2, 8, 9, 10]].astype(int)\n",
"\n",
"# get balanced dataset\n",
"x_train = x_train[:x_test.shape[0]]\n",
"y_train = y_train[:y_test.shape[0]]\n",
"\n",
"print(x_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train decision tree model"
]
},
{
"cell_type": "code",
"execution_count": 3,
@ -39,76 +96,14 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 39. 13. 2174. 0. 40.]\n",
" [ 50. 13. 0. 0. 13.]\n",
" [ 38. 9. 0. 0. 40.]\n",
" ...\n",
" [ 27. 13. 0. 0. 40.]\n",
" [ 26. 11. 0. 0. 48.]\n",
" [ 27. 9. 0. 0. 40.]]\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"# Use only numeric features (age, education-num, capital-gain, capital-loss, hours-per-week)\n",
"x_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
" usecols=(0, 4, 10, 11, 12), delimiter=\", \")\n",
"\n",
"y_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
" usecols=14, dtype=str, delimiter=\", \")\n",
"\n",
"\n",
"x_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
" usecols=(0, 4, 10, 11, 12), delimiter=\", \", skiprows=1)\n",
"\n",
"y_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
" usecols=14, dtype=str, delimiter=\", \", skiprows=1)\n",
"\n",
"# Trim trailing period \".\" from label\n",
"y_test = np.array([a[:-1] for a in y_test])\n",
"\n",
"y_train[y_train == '<=50K'] = 0\n",
"y_train[y_train == '>50K'] = 1\n",
"y_train = y_train.astype(int)\n",
"\n",
"y_test[y_test == '<=50K'] = 0\n",
"y_test[y_test == '>50K'] = 1\n",
"y_test = y_test.astype(int)\n",
"\n",
"# get balanced dataset\n",
"x_train = x_train[:x_test.shape[0]]\n",
"y_train = y_train[:y_test.shape[0]]\n",
"\n",
"print(x_train)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train decision tree model"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Base model accuracy: 0.8076285240464345\n"
"Base model accuracy: 0.8087341072415699\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mayaa/Development/GitHub/aiprivacy/ai-privacy-toolkit/venv1/lib/python3.8/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
"/Users/abigailt/Library/Python/3.9/lib/python/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
" warnings.warn(msg, category=FutureWarning)\n"
]
}
@ -122,13 +117,10 @@
"\n",
"art_classifier = ScikitlearnDecisionTreeClassifier(model)\n",
"\n",
"print('Base model accuracy: ', model.score(x_test, y_test))\n",
"\n",
"x_train_predictions = np.array([np.argmax(arr) for arr in art_classifier.predict(x_train)]).reshape(-1,1)"
"print('Base model accuracy: ', model.score(x_test, y_test))"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -139,7 +131,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@ -159,7 +151,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -168,14 +159,14 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5460017196904557\n"
"0.5434836015231544\n"
]
}
],
@ -191,7 +182,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -199,7 +189,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -213,30 +202,29 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[38. 13. 0. 0. 40.]\n",
" [46. 13. 0. 0. 35.]\n",
" [28. 9. 0. 0. 40.]\n",
"[[38 13 0 0 40]\n",
" [46 13 0 0 35]\n",
" [28 9 0 0 40]\n",
" ...\n",
" [26. 13. 0. 0. 40.]\n",
" [27. 10. 0. 0. 50.]\n",
" [28. 9. 0. 0. 40.]]\n"
" [26 13 0 0 40]\n",
" [27 10 0 0 50]\n",
" [28 9 0 0 40]]\n"
]
}
],
"source": [
"import os\n",
"import sys\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
"from apt.utils.datasets import ArrayDataset\n",
"from apt.anonymization import Anonymize\n",
"\n",
"x_train_predictions = np.array([np.argmax(arr) for arr in art_classifier.predict(x_train)])\n",
"\n",
"# QI = (age, education-num, capital-gain, hours-per-week)\n",
"QI = [0, 1, 2, 4]\n",
"anonymizer = Anonymize(100, QI)\n",
@ -246,7 +234,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@ -255,7 +243,7 @@
"6739"
]
},
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -267,7 +255,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@ -276,7 +264,7 @@
"401"
]
},
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@ -287,7 +275,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -296,21 +283,21 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Anonymized model accuracy: 0.826914808672686\n"
"Anonymized model accuracy: 0.8308457711442786\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mayaa/Development/GitHub/aiprivacy/ai-privacy-toolkit/venv1/lib/python3.8/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
"/Users/abigailt/Library/Python/3.9/lib/python/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
" warnings.warn(msg, category=FutureWarning)\n"
]
}
@ -325,7 +312,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -335,14 +321,14 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.49692912418621793\n"
"0.4944724235351923\n"
]
}
],
@ -364,7 +350,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -380,8 +365,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"(0.5316007088009451, 0.7738607050730868)\n",
"(0.4971184877823882, 0.5297874953936863)\n"
"without anonymization: (0.5303914835164835, 0.7588748311018303)\n",
"with anonymization: (0.49255952380952384, 0.3659255619702739)\n"
]
}
],
@ -411,15 +396,14 @@
" return precision, recall\n",
"\n",
"# regular\n",
"print(calc_precision_recall(np.concatenate((inferred_train_bb, inferred_test_bb)), \n",
"print('without anonymization:', calc_precision_recall(np.concatenate((inferred_train_bb, inferred_test_bb)), \n",
" np.concatenate((np.ones(len(inferred_train_bb)), np.zeros(len(inferred_test_bb))))))\n",
"# anon\n",
"print(calc_precision_recall(np.concatenate((anon_inferred_train_bb, anon_inferred_test_bb)), \n",
"print('with anonymization:', calc_precision_recall(np.concatenate((anon_inferred_train_bb, anon_inferred_test_bb)), \n",
" np.concatenate((np.ones(len(anon_inferred_train_bb)), np.zeros(len(anon_inferred_test_bb))))))"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@ -429,7 +413,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@ -443,7 +427,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.9.6"
}
},
"nbformat": 4,

View file

@ -1,5 +1,6 @@
import pytest
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
@ -118,6 +119,74 @@ def test_regression():
assert ((np.delete(anon, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
def test_anonymize_ndarray_one_hot():
    """k-anonymize a numpy dataset in which columns 1-2 are a single 1-hot encoded
    feature (gender), declared via ``quasi_identifer_slices`` so the anonymizer keeps
    the slice consistent (exactly one hot column per row) after generalization.
    """
    x_train = np.array([[23, 0, 1, 165],
                        [45, 0, 1, 158],
                        [56, 1, 0, 123],
                        [67, 0, 1, 154],
                        [45, 1, 0, 149],
                        [42, 1, 0, 166],
                        [73, 0, 1, 172],
                        [94, 0, 1, 168],
                        [69, 0, 1, 175],
                        [24, 1, 0, 181],
                        [18, 1, 0, 190]])
    y_train = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    model = DecisionTreeClassifier()
    model.fit(x_train, y_train)
    pred = model.predict(x_train)

    k = 10
    QI = [0, 1, 2]
    QI_slices = [[1, 2]]  # columns 1 and 2 together form one 1-hot feature
    anonymizer = Anonymize(k, QI, train_only_QI=True, quasi_identifer_slices=QI_slices)
    anon = anonymizer.anonymize(ArrayDataset(x_train, pred))

    # anonymization must reduce the number of distinct QI combinations
    assert (len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0)))
    # k-anonymity: every distinct QI *row* must occur at least k times.
    # axis=0 is required here; without it np.unique flattens the array and counts
    # scalar values, which does not verify the per-record k-anonymity guarantee
    # (the pandas variant of this test correctly counts rows via value_counts()).
    _, counts_elements = np.unique(anon[:, QI], axis=0, return_counts=True)
    assert (np.min(counts_elements) >= k)
    # non-QI columns must be left untouched
    assert ((np.delete(anon, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
    # the 1-hot slice must remain a valid encoding: exactly one 1 and the rest 0
    anonymized_slice = anon[:, QI_slices[0]]
    assert ((np.sum(anonymized_slice, axis=1) == 1).all())
    assert ((np.max(anonymized_slice, axis=1) == 1).all())
    assert ((np.min(anonymized_slice, axis=1) == 0).all())
def test_anonymize_pandas_one_hot():
    """k-anonymize a pandas dataset where gender is 1-hot encoded across two
    columns that are declared as a single quasi-identifier slice.
    """
    feature_names = ["age", "gender_M", "gender_F", "height"]
    raw = np.array([[23, 0, 1, 165],
                    [45, 0, 1, 158],
                    [56, 1, 0, 123],
                    [67, 0, 1, 154],
                    [45, 1, 0, 149],
                    [42, 1, 0, 166],
                    [73, 0, 1, 172],
                    [94, 0, 1, 168],
                    [69, 0, 1, 175],
                    [24, 1, 0, 181],
                    [18, 1, 0, 190]])
    labels = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    x_train = pd.DataFrame(raw, columns=feature_names)
    y_train = pd.Series(labels)

    model = DecisionTreeClassifier()
    model.fit(x_train, y_train)
    pred = model.predict(x_train)

    k = 10
    QI = ["age", "gender_M", "gender_F"]
    QI_slices = [["gender_M", "gender_F"]]  # the two columns encode one feature
    anonymizer = Anonymize(k, QI, train_only_QI=True, quasi_identifer_slices=QI_slices)
    anon = anonymizer.anonymize(ArrayDataset(x_train, pred))

    # fewer distinct QI combinations after anonymization
    before = x_train.loc[:, QI].drop_duplicates().shape[0]
    after = anon.loc[:, QI].drop_duplicates().shape[0]
    assert (after < before)
    # every remaining QI combination appears at least k times
    assert (anon.loc[:, QI].value_counts().min() >= k)
    # columns outside the QI set are untouched
    np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
    # the 1-hot slice stays a valid encoding: one column is 1, the other 0
    anonymized_slice = anon.loc[:, QI_slices[0]]
    assert ((np.sum(anonymized_slice, axis=1) == 1).all())
    assert ((np.max(anonymized_slice, axis=1) == 1).all())
    assert ((np.min(anonymized_slice, axis=1) == 0).all())
def test_errors():
    """Constructor validation: k=1 provides no anonymity and must be rejected."""
    invalid_k = 1
    with pytest.raises(ValueError):
        Anonymize(invalid_k, [0, 2])

View file

@ -11,6 +11,8 @@ from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from torch import nn, optim
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
@ -24,6 +26,9 @@ from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegresso
tf.compat.v1.disable_eager_execution()
ACCURACY_DIFF = 0.05
@pytest.fixture
def diabetes_dataset():
return load_diabetes()
@ -286,7 +291,7 @@ def test_minimizer_fit(data_two_features):
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_minimizer_ncp(data_two_features):
@ -348,12 +353,15 @@ def test_minimizer_ncp_categorical(data_four_features):
train_dataset = ArrayDataset(x, predictions, features_names=features)
gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, generalize_using_transform=False)
categorical_features=categorical_features,
generalize_using_transform=False,
encoder=preprocessor)
gen1.fit(dataset=train_dataset)
ncp1 = gen1.ncp.fit_score
ncp2 = gen1.calculate_ncp(ad1)
gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, categorical_features=categorical_features)
gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, categorical_features=categorical_features,
encoder=preprocessor)
gen2.fit(dataset=train_dataset)
ncp3 = gen2.ncp.fit_score
gen2.transform(dataset=ad1)
@ -414,7 +422,8 @@ def test_minimizer_fit_pandas(data_four_features):
# Now we have a full prediction pipeline.
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features)
categorical_features=categorical_features,
encoder=preprocessor)
train_dataset = ArrayDataset(x, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(x))
@ -428,7 +437,7 @@ def test_minimizer_fit_pandas(data_four_features):
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_minimizer_params_categorical(cells_categorical):
@ -450,13 +459,14 @@ def test_minimizer_params_categorical(cells_categorical):
# Now we have a full prediction pipeline.
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, cells=cells)
categorical_features=categorical_features, cells=cells,
encoder=preprocessor)
train_dataset = ArrayDataset(x, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(x))
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_minimizer_fit_qi(data_three_features):
@ -484,7 +494,7 @@ def test_minimizer_fit_qi(data_three_features):
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_minimizer_fit_pandas_qi(data_five_features):
@ -508,7 +518,8 @@ def test_minimizer_fit_pandas_qi(data_five_features):
# Now we have a full prediction pipeline.
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=qi)
categorical_features=categorical_features, features_to_minimize=qi,
encoder=preprocessor)
train_dataset = ArrayDataset(x, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(x))
@ -523,7 +534,7 @@ def test_minimizer_fit_pandas_qi(data_five_features):
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_minimize_ndarray_iris():
@ -552,7 +563,7 @@ def test_minimize_ndarray_iris():
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_minimize_pandas_adult():
@ -582,7 +593,8 @@ def test_minimize_pandas_adult():
predictions = np.argmax(predictions, axis=1)
target_accuracy = 0.7
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=qi)
categorical_features=categorical_features, features_to_minimize=qi,
encoder=preprocessor)
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
transformed = gen.transform(dataset=ArrayDataset(x_train))
gener = gen.generalizations
@ -609,7 +621,7 @@ def test_minimize_pandas_adult():
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_german_credit_pandas():
@ -637,7 +649,8 @@ def test_german_credit_pandas():
predictions = np.argmax(predictions, axis=1)
target_accuracy = 0.7
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=qi)
categorical_features=categorical_features, features_to_minimize=qi,
encoder=preprocessor)
gen.fit(dataset=ArrayDataset(x_train, predictions))
transformed = gen.transform(dataset=ArrayDataset(x_train))
gener = gen.generalizations
@ -666,7 +679,7 @@ def test_german_credit_pandas():
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_regression(diabetes_dataset):
@ -726,7 +739,7 @@ def test_regression(diabetes_dataset):
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_x_y():
@ -766,7 +779,7 @@ def test_x_y():
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_x_y_features_names():
@ -806,7 +819,7 @@ def test_x_y_features_names():
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_BaseEstimator_classification(data_five_features):
@ -828,7 +841,8 @@ def test_BaseEstimator_classification(data_five_features):
# Now we have a full prediction pipeline.
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=QI)
categorical_features=categorical_features, features_to_minimize=QI,
encoder=preprocessor)
train_dataset = ArrayDataset(x, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(x))
@ -844,7 +858,7 @@ def test_BaseEstimator_classification(data_five_features):
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(preprocessor.transform(transformed), predictions)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_BaseEstimator_regression(diabetes_dataset):
@ -903,7 +917,7 @@ def test_BaseEstimator_regression(diabetes_dataset):
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(transformed, predictions)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_keras_model():
@ -936,7 +950,39 @@ def test_keras_model():
check_ncp(ncp, gener)
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
class PytorchModel(nn.Module):
    """Small fully-connected classifier shared by the pytorch minimizer tests.

    Four Tanh-activated hidden layers narrow the representation
    (num_features -> 1024 -> 512 -> 256 -> 128) before a final linear
    classification head producing ``num_classes`` logits.
    """

    def __init__(self, num_classes, num_features):
        super().__init__()
        # Attribute names (fc1..fc4, classifier) are part of the state_dict
        # layout and are kept as-is.
        self.fc1 = nn.Sequential(nn.Linear(num_features, 1024), nn.Tanh())
        self.fc2 = nn.Sequential(nn.Linear(1024, 512), nn.Tanh())
        self.fc3 = nn.Sequential(nn.Linear(512, 256), nn.Tanh())
        self.fc4 = nn.Sequential(nn.Linear(256, 128), nn.Tanh())
        self.classifier = nn.Linear(128, num_classes)

    def forward(self, x):
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = layer(hidden)
        return self.classifier(hidden)
def test_minimizer_pytorch(data_three_features):
@ -944,49 +990,17 @@ def test_minimizer_pytorch(data_three_features):
x = x.astype(np.float32)
qi = ['age', 'weight']
from torch import nn, optim
from apt.utils.datasets.datasets import PytorchData
from apt.utils.models.pytorch_model import PyTorchClassifier
class pytorch_model(nn.Module):
def __init__(self, num_classes, num_features):
super(pytorch_model, self).__init__()
self.fc1 = nn.Sequential(
nn.Linear(num_features, 1024),
nn.Tanh(), )
self.fc2 = nn.Sequential(
nn.Linear(1024, 512),
nn.Tanh(), )
self.fc3 = nn.Sequential(
nn.Linear(512, 256),
nn.Tanh(), )
self.fc4 = nn.Sequential(
nn.Linear(256, 128),
nn.Tanh(),
)
self.classifier = nn.Linear(128, num_classes)
def forward(self, x):
out = self.fc1(x)
out = self.fc2(out)
out = self.fc3(out)
out = self.fc4(out)
return self.classifier(out)
base_est = pytorch_model(2, 3)
base_est = PytorchModel(2, 3)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(base_est.parameters(), lr=0.01)
model = PyTorchClassifier(model=base_est, output_type=ModelOutputType.CLASSIFIER_LOGITS, loss=criterion,
optimizer=optimizer, input_shape=(3,),
nb_classes=2)
model.fit(PytorchData(x.astype(np.float32), y), save_entire_model=False, nb_epochs=10)
model.fit(PytorchData(x, y), save_entire_model=False, nb_epochs=10)
ad = ArrayDataset(x)
predictions = model.predict(ad)
@ -1006,7 +1020,41 @@ def test_minimizer_pytorch(data_three_features):
check_ncp(ncp, expected_generalizations)
rel_accuracy = model.score(ArrayDataset(transformed.astype(np.float32), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_minimizer_pytorch_iris():
    """End-to-end minimization of the iris dataset with a PyTorch classifier."""
    features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
    (x_train, y_train), _ = get_iris_dataset_np()
    x_train = x_train.astype(np.float32)
    # minimize every feature
    qi = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

    from apt.utils.datasets.datasets import PytorchData
    from apt.utils.models.pytorch_model import PyTorchClassifier

    # 3 iris classes, 4 input features
    net = PytorchModel(3, 4)
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(net.parameters(), lr=0.01)
    model = PyTorchClassifier(model=net, output_type=ModelOutputType.CLASSIFIER_LOGITS,
                              loss=loss_fn, optimizer=opt, input_shape=(4,), nb_classes=3)
    model.fit(PytorchData(x_train, y_train), save_entire_model=False, nb_epochs=10)

    predictions = model.predict(ArrayDataset(x_train))
    if predictions.shape[1] > 1:
        # collapse per-class logits to class labels
        predictions = np.argmax(predictions, axis=1)

    target_accuracy = 0.99
    gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=qi)
    transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features))

    gener = gen.generalizations
    check_features(features, gener, transformed, x_train)
    check_ncp(gen.ncp.transform_score, gener)
    rel_accuracy = model.score(ArrayDataset(transformed.astype(np.float32), predictions))
    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_untouched():