mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-06-08 15:05:13 +02:00
Support additional use cases for data (#46)
* Make ART black box classifier not apply preprocessing to data
* Add option to store predictions (in addition to x, y) in Dataset and Data classes
This commit is contained in:
parent
e25e58b253
commit
00f9c16863
6 changed files with 139 additions and 62 deletions
|
|
@ -4,4 +4,4 @@ Implementation of datasets utility components for datasets creation, load, and s
|
|||
"""
|
||||
|
||||
from apt.utils.datasets.datasets import Dataset, StoredDataset, DatasetFactory, Data, ArrayDataset, \
|
||||
OUTPUT_DATA_ARRAY_TYPE, DATA_PANDAS_NUMPY_TYPE
|
||||
DatasetWithPredictions, OUTPUT_DATA_ARRAY_TYPE, DATA_PANDAS_NUMPY_TYPE
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ class Dataset(metaclass=ABCMeta):
|
|||
|
||||
:return: the data samples
|
||||
"""
|
||||
pass
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def get_labels(self) -> Collection[Any]:
|
||||
|
|
@ -47,7 +47,16 @@ class Dataset(metaclass=ABCMeta):
|
|||
|
||||
:return: the labels
|
||||
"""
|
||||
pass
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
    """
    Return the model predictions held by this dataset.

    This is an abstract hook: the base implementation only raises, and concrete
    datasets are expected to override it.

    :return: predictions as numpy array
    """
    raise NotImplementedError()
|
||||
|
||||
def _array2numpy(self, arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
|
||||
"""
|
||||
|
|
@ -102,7 +111,7 @@ class StoredDataset(Dataset):
|
|||
:type path: string
|
||||
:return: None
|
||||
"""
|
||||
pass
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def load(self, **kwargs):
|
||||
|
|
@ -111,7 +120,7 @@ class StoredDataset(Dataset):
|
|||
|
||||
:return: None
|
||||
"""
|
||||
pass
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
def download(url: str, dest_path: str, filename: str, unzip: Optional[bool] = False) -> None:
|
||||
|
|
@ -224,7 +233,7 @@ class ArrayDataset(Dataset):
|
|||
raise ValueError("The supplied features are not the same as in the data features")
|
||||
self.features_names = x.columns.to_list()
|
||||
|
||||
if y is not None and len(self._x) != len(self._y):
|
||||
if self._y is not None and len(self._x) != len(self._y):
|
||||
raise ValueError('Non equivalent lengths of x and y')
|
||||
|
||||
def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
|
||||
|
|
@ -243,6 +252,70 @@ class ArrayDataset(Dataset):
|
|||
"""
|
||||
return self._y
|
||||
|
||||
def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
    """
    Get predictions.

    This dataset type only holds samples and labels, so the call always fails.
    Use DatasetWithPredictions when model predictions need to be stored.

    :return: never returns
    :raises NotImplementedError: always, since no predictions are stored
    """
    # A message is attached so callers can tell why the call failed and what to use instead.
    raise NotImplementedError(
        f'{type(self).__name__} does not store predictions, use DatasetWithPredictions instead')
|
||||
|
||||
|
||||
class DatasetWithPredictions(Dataset):
    """
    Dataset that is based on arrays (e.g., numpy/pandas/list...). Includes predictions from a model, and possibly also
    features and true labels.

    :param pred: collection of model predictions
    :type pred: numpy array or pandas DataFrame or list or pytorch Tensor
    :param x: collection of data samples
    :type x: numpy array or pandas DataFrame or list or pytorch Tensor, optional
    :param y: collection of labels
    :type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
    :param features_names: The feature names, in the order that they appear in the data
    :type features_names: list of strings, optional
    :raises ValueError: if the supplied feature names differ from the column names of ``x``, or if the lengths of
                        ``x`` or ``y`` differ from the length of ``pred``
    """

    def __init__(self, pred: INPUT_DATA_ARRAY_TYPE, x: Optional[INPUT_DATA_ARRAY_TYPE] = None,
                 y: Optional[INPUT_DATA_ARRAY_TYPE] = None, features_names: Optional[list] = None, **kwargs):
        self.is_pandas = False
        self.features_names = features_names
        self._pred = self._array2numpy(pred)
        self._y = self._array2numpy(y) if y is not None else None
        self._x = self._array2numpy(x) if x is not None else None

        # NOTE(review): is_pandas is a single instance-wide flag, presumably set by _array2numpy (not visible
        # here) for any pandas input - confirm. If pred or y set it while x is not a DataFrame, reading
        # x.columns would raise AttributeError, so the column handling is limited to inputs that have columns.
        if self.is_pandas and x is not None and hasattr(x, 'columns'):
            if features_names and not np.array_equal(features_names, x.columns):
                raise ValueError("The supplied features are not the same as in the data features")
            self.features_names = x.columns.to_list()

        # Predictions are the only mandatory array, so every optional array is validated against them.
        if self._y is not None and len(self._pred) != len(self._y):
            raise ValueError('Non equivalent lengths of pred and y')

        if self._x is not None and len(self._x) != len(self._pred):
            raise ValueError('Non equivalent lengths of x and pred')

    def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Get data samples

        :return: data samples as numpy array, or None if no samples were supplied
        """
        return self._x

    def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Get labels

        :return: labels as numpy array, or None if no labels were supplied
        """
        return self._y

    def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Get predictions

        :return: predictions as numpy array
        """
        return self._pred
|
||||
|
||||
|
||||
class PytorchData(Dataset):
|
||||
"""
|
||||
|
|
@ -284,6 +357,14 @@ class PytorchData(Dataset):
|
|||
"""
|
||||
return self._array2numpy(self._y) if self._y is not None else None
|
||||
|
||||
def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
    """
    Get predictions.

    Predictions are not available from this dataset type, so the call always fails.

    :return: never returns
    :raises NotImplementedError: always
    """
    # A message is attached so callers can tell why the call failed and what to use instead.
    raise NotImplementedError(
        f'{type(self).__name__} does not store predictions, use DatasetWithPredictions instead')
|
||||
|
||||
def get_sample_item(self, idx: int) -> Tensor:
|
||||
"""
|
||||
Get the sample according to the given index
|
||||
|
|
|
|||
|
|
@ -270,7 +270,8 @@ class BlackboxClassifierPredictions(BlackboxClassifier):
|
|||
self._nb_classes = get_nb_classes(y_pred)
|
||||
self._input_shape = x_pred.shape[1:]
|
||||
predict_fn = (x_pred, y_pred)
|
||||
self._art_model = BlackBoxClassifier(predict_fn, self._input_shape, self._nb_classes, fuzzy_float_compare=True)
|
||||
self._art_model = BlackBoxClassifier(predict_fn, self._input_shape, self._nb_classes, fuzzy_float_compare=True,
|
||||
preprocessing=None)
|
||||
|
||||
|
||||
class BlackboxClassifierPredictFunction(BlackboxClassifier):
|
||||
|
|
@ -298,4 +299,4 @@ class BlackboxClassifierPredictFunction(BlackboxClassifier):
|
|||
super().__init__(model, output_type, black_box_access=True, unlimited_queries=unlimited_queries, **kwargs)
|
||||
self._nb_classes = nb_classes
|
||||
self._input_shape = input_shape
|
||||
self._art_model = BlackBoxClassifier(model, self._input_shape, self._nb_classes)
|
||||
self._art_model = BlackBoxClassifier(model, self._input_shape, self._nb_classes, preprocessing=None)
|
||||
|
|
|
|||
41
tests/test_datasets.py
Normal file
41
tests/test_datasets.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
import pytest
|
||||
import numpy as np
|
||||
|
||||
from apt.utils.datasets import Data, DatasetWithPredictions
|
||||
from apt.utils import dataset_utils
|
||||
|
||||
|
||||
def test_dataset_predictions():
    """A predictions-only dataset returns its predictions unchanged through Data."""
    # Only predictions are stored here, so no samples or labels need to be loaded.
    pred = np.array([[0.23, 0.56, 0.21]] * 105)

    dataset = DatasetWithPredictions(pred)
    data = Data(train=dataset)

    train_set = data.get_train_set()

    # array_equal also checks the shape, unlike element-wise np.equal which broadcasts.
    assert np.array_equal(pred, train_set.get_predictions())
    # Nothing but predictions was supplied, so samples and labels stay unset.
    assert train_set.get_samples() is None
    assert train_set.get_labels() is None
|
||||
|
||||
|
||||
def test_dataset_predictions_x():
    """A dataset built from predictions and samples returns both unchanged through Data."""
    (x_train, _), (_, _) = dataset_utils.get_iris_dataset_np()
    # One prediction row per sample, so the test does not depend on the size of the train split.
    pred = np.array([[0.23, 0.56, 0.21]] * len(x_train))

    dataset = DatasetWithPredictions(pred, x=x_train)
    data = Data(train=dataset)

    train_set = data.get_train_set()

    # array_equal also checks the shape, unlike element-wise np.equal which broadcasts.
    assert np.array_equal(pred, train_set.get_predictions())
    assert np.array_equal(x_train, train_set.get_samples())
    assert train_set.get_labels() is None
|
||||
|
||||
|
||||
def test_dataset_predictions_y():
    """A dataset built from predictions, samples and labels returns all three unchanged through Data."""
    (x_train, y_train), (_, _) = dataset_utils.get_iris_dataset_np()
    # One prediction row per sample, so the test does not depend on the size of the train split.
    pred = np.array([[0.23, 0.56, 0.21]] * len(x_train))

    dataset = DatasetWithPredictions(pred, x=x_train, y=y_train)
    data = Data(train=dataset)

    train_set = data.get_train_set()

    # array_equal also checks the shape, unlike element-wise np.equal which broadcasts.
    assert np.array_equal(pred, train_set.get_predictions())
    assert np.array_equal(x_train, train_set.get_samples())
    assert np.array_equal(y_train, train_set.get_labels())
|
||||
|
|
@ -944,53 +944,6 @@ def test_keras_model():
|
|||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
|
||||
|
||||
def test_blackbox_model():
|
||||
(X, y), (x_test, y_test) = get_iris_dataset_np()
|
||||
train_data = ArrayDataset(X, y)
|
||||
test_data = ArrayDataset(x_test, y_test)
|
||||
data = Data(train_data, test_data)
|
||||
|
||||
model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_PROBABILITIES)
|
||||
ad = ArrayDataset(x_test)
|
||||
predictions = model.predict(ad)
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
target_accuracy = 0.5
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
|
||||
train_dataset = ArrayDataset(x_test, predictions)
|
||||
|
||||
gen.fit(dataset=train_dataset)
|
||||
transformed = gen.transform(dataset=ad)
|
||||
gener = gen.generalizations
|
||||
expected_generalizations = {'ranges': {'0': [], '1': [], '2': [4.849999904632568], '3': [0.7000000029802322]},
|
||||
'categories': {},
|
||||
'untouched': []}
|
||||
|
||||
for key in expected_generalizations['ranges']:
|
||||
assert_almost_equal(expected_generalizations['ranges'][key], gener['ranges'][key])
|
||||
for key in expected_generalizations['categories']:
|
||||
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
|
||||
set([frozenset(sl) for sl in gener['categories'][key]]))
|
||||
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
|
||||
|
||||
features = ['0', '1', '2', '3']
|
||||
modified_features = [f for f in features if
|
||||
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
|
||||
'ranges'].keys()]
|
||||
indexes = []
|
||||
for i in range(len(features)):
|
||||
if features[i] in modified_features:
|
||||
indexes.append(i)
|
||||
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_test, indexes, axis=1)).all())
|
||||
ncp = gen.ncp
|
||||
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
||||
assert (ncp > 0)
|
||||
assert (((transformed[indexes]) != (X[indexes])).any())
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
|
||||
|
||||
def test_untouched():
|
||||
cells = [{"id": 1, "ranges": {"age": {"start": None, "end": 38}}, "label": 0,
|
||||
'categories': {'gender': ['male']}, "representative": {"age": 26, "height": 149}},
|
||||
|
|
|
|||
|
|
@ -77,7 +77,7 @@ def test_blackbox_classifier():
|
|||
assert(pred.shape[0] == x_test.shape[0])
|
||||
|
||||
score = model.score(test)
|
||||
assert(0.0 <= score <= 1.0)
|
||||
assert(score == 1.0)
|
||||
|
||||
def test_blackbox_classifier_no_test():
|
||||
(x_train, y_train), (_, _) = dataset_utils.get_iris_dataset_np()
|
||||
|
|
@ -90,7 +90,7 @@ def test_blackbox_classifier_no_test():
|
|||
assert(pred.shape[0] == x_train.shape[0])
|
||||
|
||||
score = model.score(train)
|
||||
assert(0.0 <= score <= 1.0)
|
||||
assert (score == 1.0)
|
||||
|
||||
|
||||
def test_blackbox_classifier_no_train():
|
||||
|
|
@ -103,7 +103,7 @@ def test_blackbox_classifier_no_train():
|
|||
assert(pred.shape[0] == x_test.shape[0])
|
||||
|
||||
score = model.score(test)
|
||||
assert(0.0 <= score <= 1.0)
|
||||
assert (score == 1.0)
|
||||
|
||||
|
||||
def test_blackbox_classifier_no_test_y():
|
||||
|
|
@ -117,7 +117,7 @@ def test_blackbox_classifier_no_test_y():
|
|||
assert(pred.shape[0] == x_train.shape[0])
|
||||
|
||||
score = model.score(train)
|
||||
assert(0.0 <= score <= 1.0)
|
||||
assert (score == 1.0)
|
||||
|
||||
# since no test_y, BBC should use only test thus predict test should fail
|
||||
unable_to_predict_test = False
|
||||
|
|
@ -139,7 +139,7 @@ def test_blackbox_classifier_no_train_y():
|
|||
assert (pred.shape[0] == x_test.shape[0])
|
||||
|
||||
score = model.score(test)
|
||||
assert (0.0 <= score <= 1.0)
|
||||
assert (score == 1.0)
|
||||
|
||||
# since no train_y, BBC should use only test thus predict train should fail
|
||||
unable_to_predict_train = False
|
||||
|
|
@ -164,7 +164,7 @@ def test_blackbox_classifier_probabilities():
|
|||
assert (pred < 1.0).all()
|
||||
|
||||
score = model.score(train)
|
||||
assert (0.0 <= score <= 1.0)
|
||||
assert (score == 1.0)
|
||||
|
||||
|
||||
def test_blackbox_classifier_predict():
|
||||
|
|
@ -172,6 +172,7 @@ def test_blackbox_classifier_predict():
|
|||
return [0.23, 0.56, 0.21]
|
||||
|
||||
(x_train, y_train), (_, _) = dataset_utils.get_iris_dataset_np()
|
||||
y_train = np.array([[0.23, 0.56, 0.21] for i in range(105)])
|
||||
|
||||
train = ArrayDataset(x_train, y_train)
|
||||
|
||||
|
|
@ -182,7 +183,7 @@ def test_blackbox_classifier_predict():
|
|||
assert (pred < 1.0).all()
|
||||
|
||||
score = model.score(train)
|
||||
assert (0.0 <= score <= 1.0)
|
||||
assert (score == 1.0)
|
||||
|
||||
def test_is_one_hot():
|
||||
(_, y_train), (_, _) = dataset_utils.get_iris_dataset_np()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue