Merge with main

This commit is contained in:
abigailt 2022-08-01 18:11:34 +03:00
commit dc5cc793ee
30 changed files with 2819 additions and 1066 deletions

View file

@ -6,6 +6,17 @@ from os import path, mkdir
from six.moves.urllib.request import urlretrieve
def get_iris_dataset_np(test_set: float = 0.3):
"""
Loads the Iris dataset from scikit-learn.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1). Default is 0.3
:type test_set: float
:return: Entire dataset and labels as numpy arrays. Returned as a tuple (x_train, y_train), (x_test, y_test)
"""
return _load_iris(test_set)
def _load_iris(test_set_size: float = 0.3):
iris = datasets.load_iris()
data = iris.data
@ -18,14 +29,15 @@ def _load_iris(test_set_size: float = 0.3):
return (x_train, y_train), (x_test, y_test)
def get_iris_dataset(test_set: float = 0.3):
def get_diabetes_dataset_np(test_set: float = 0.3):
"""
Loads the Iris dataset from scikit-learn.
Loads the Diabetes dataset from scikit-learn.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1).
:return: Entire dataset and labels as numpy array.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1). Default is 0.3
:type test_set: float
:return: Entire dataset and labels as numpy arrays. Returned as a tuple (x_train, y_train), (x_test, y_test)
"""
return _load_iris(test_set)
return _load_diabetes(test_set)
def _load_diabetes(test_set_size: float = 0.3):
@ -40,22 +52,14 @@ def _load_diabetes(test_set_size: float = 0.3):
return (x_train, y_train), (x_test, y_test)
def get_diabetes_dataset():
def get_german_credit_dataset_pd(test_set: float = 0.3):
"""
Loads the Iris dataset from scikit-learn.
Loads the UCI German credit dataset from `tests/datasets/german` or downloads it from
https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/ if necessary.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1).
:return: Entire dataset and labels as numpy array.
"""
return _load_diabetes()
def get_german_credit_dataset(test_set: float = 0.3):
"""
Loads the UCI German_credit dataset from `tests/datasets/german` or downloads it if necessary.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1).
:return: Dataset and labels as pandas dataframes.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1). Default is 0.3
:type test_set: float
:return: Dataset and labels as pandas dataframes. Returned as a tuple (x_train, y_train), (x_test, y_test)
"""
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'
@ -118,15 +122,21 @@ def _modify_german_dataset(data):
return 1
else:
raise Exception('Bad value')
def modify_label(value):
return value - 1
data['Foreign_worker'] = data['Foreign_worker'].apply(modify_Foreign_worker)
data['Telephone'] = data['Telephone'].apply(modify_Telephone)
data['label'] = data['label'].apply(modify_label)
def get_adult_dataset():
def get_adult_dataset_pd():
"""
Loads the UCI Adult dataset from `tests/datasets/adult` or downloads it if necessary.
Loads the UCI Adult dataset from `tests/datasets/adult` or downloads it from
https://archive.ics.uci.edu/ml/machine-learning-databases/adult/ if necessary.
:return: Dataset and labels as pandas dataframes.
:return: Dataset and labels as pandas dataframes. Returned as a tuple (x_train, y_train), (x_test, y_test)
"""
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
@ -223,17 +233,22 @@ def _modify_adult_dataset(data):
return data.drop(['fnlwgt', 'education'], axis=1)
def get_nursery_dataset(raw: bool = True, test_set: float = 0.2, transform_social: bool = False):
def get_nursery_dataset_pd(raw: bool = True, test_set: float = 0.2, transform_social: bool = False):
"""
Loads the UCI Nursery dataset from `tests/datasets/nursery` or downloads it if necessary.
Loads the UCI Nursery dataset from `tests/datasets/nursery` or downloads it from
https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/ if necessary.
:param raw: `True` if no preprocessing should be applied to the data. Otherwise, categorical data is one-hot
encoded and data is scaled using sklearn's StandardScaler.
:param test_set: Proportion of the data to use as validation split. The value should be between 0 and 1.
:type raw: boolean
:param test_set: Proportion of the data to use as validation split. The value should be between 0 and 1. Default is
0.2
:type test_set: float
:param transform_social: If `True`, transforms the social feature to be binary for the purpose of attribute
inference. This is done by assigning the original value 'problematic' the new value 1, and
the other original values are assigned the new value 0.
:return: Dataset and labels as pandas dataframes.
:type transform_social: boolean
:return: Dataset and labels as pandas dataframes. Returned as a tuple (x_train, y_train), (x_test, y_test)
"""
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data'
data_dir = '../datasets/nursery'

View file

@ -4,4 +4,4 @@ Implementation of datasets utility components for datasets creation, load, and s
"""
from apt.utils.datasets.datasets import Dataset, StoredDataset, DatasetFactory, Data, ArrayDataset, \
OUTPUT_DATA_ARRAY_TYPE, DATA_PANDAS_NUMPY_TYPE
DatasetWithPredictions, OUTPUT_DATA_ARRAY_TYPE, DATA_PANDAS_NUMPY_TYPE

View file

@ -5,7 +5,7 @@ Implementation of utility classes for dataset handling
"""
from abc import ABCMeta, abstractmethod
from typing import Callable, Collection, Any, Union, List, Optional
from typing import Callable, Collection, Any, Union, List, Optional, Type
import tarfile
import os
@ -66,36 +66,68 @@ class Dataset(metaclass=ABCMeta):
@abstractmethod
def get_samples(self) -> Collection[Any]:
"""Return data samples"""
pass
"""
Return data samples
:return: the data samples
"""
raise NotImplementedError
@abstractmethod
def get_labels(self) -> Collection[Any]:
"""Return labels"""
pass
"""
Return labels
:return: the labels
"""
raise NotImplementedError
@abstractmethod
def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""
Get predictions
:return: predictions as numpy array
"""
raise NotImplementedError
class StoredDataset(Dataset):
"""Abstract Class for Storable Dataset"""
"""Abstract Class for a Dataset that can be downloaded from a URL and stored in a file"""
@abstractmethod
def load_from_file(self, path: str):
"""Load dataset from file"""
pass
"""
Load dataset from file
:param path: the path to the file
:type path: string
:return: None
"""
raise NotImplementedError
@abstractmethod
def load(self, **kwargs):
"""Load dataset"""
pass
"""
Load dataset
:return: None
"""
raise NotImplementedError
@staticmethod
def download(url: str, dest_path: str, filename: str, unzip: bool = False) -> None:
def download(url: str, dest_path: str, filename: str, unzip: Optional[bool] = False) -> None:
"""
Download the dataset from URL
:param url: dataset URL, the dataset will be requested from this URL
:type url: string
:param dest_path: local dataset destination path
:type dest_path: string
:param filename: local dataset filename
:param unzip: flag whether or not perform extraction
:type filename: string
:param unzip: flag whether or not perform extraction. Default is False.
:type unzip: boolean, optional
:return: None
"""
file_path = os.path.join(dest_path, filename)
@ -113,12 +145,16 @@ class StoredDataset(Dataset):
StoredDataset.extract_archive(zip_path=file_path, dest_path=dest_path, remove_archive=False)
@staticmethod
def extract_archive(zip_path: str, dest_path=None, remove_archive=False):
def extract_archive(zip_path: str, dest_path: Optional[str] = None, remove_archive: Optional[bool] = False):
"""
Extract dataset from archived file
:param zip_path: path to archived file
:type zip_path: string
:param dest_path: directory path to uncompress the file to
:param remove_archive: whether remove the archive file after uncompress (default False)
:type dest_path: string, optional
:param remove_archive: whether remove the archive file after uncompress. Default is False.
:type remove_archive: boolean, optional
:return: None
"""
logger.info("Extracting the dataset...")
@ -132,15 +168,23 @@ class StoredDataset(Dataset):
logger.info("Extracted the dataset")
@staticmethod
def split_debug(datafile: str, dest_datafile: str, ratio: int, shuffle=True, delimiter=",", fmt=None) -> None:
def split_debug(datafile: str, dest_datafile: str, ratio: int, shuffle: Optional[bool] = True,
delimiter: Optional[str] = ",", fmt: Optional[Union[str, list]] = None) -> None:
"""
Split the data and take only a part of it
:param datafile: dataset file path
:type datafile: string
:param dest_datafile: destination path for the partial dataset file
:type dest_datafile: string
:param ratio: part of the dataset to save
:param shuffle: whether to shuffle the data or not (default True)
:param delimiter: dataset delimiter (default ",")
:param fmt: format for the correct data saving
:type ratio: int
:param shuffle: whether to shuffle the data or not. Default is True.
:type shuffle: boolean, optional
:param delimiter: dataset delimiter. Default is ","
:type delimiter: string, optional
:param fmt: format for the correct data saving. As defined by numpy.savetxt(). Default is None.
:type fmt: string or sequence of strings, optional
:return: None
"""
if os.path.isfile(dest_datafile):
@ -160,22 +204,19 @@ class StoredDataset(Dataset):
class ArrayDataset(Dataset):
"""Dataset that is based on x and y arrays (e.g., numpy/pandas/list...)"""
"""
Dataset that is based on x and y arrays (e.g., numpy/pandas/list...)
def __init__(
self,
x: INPUT_DATA_ARRAY_TYPE,
y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
features_names: Optional = None,
**kwargs,
):
"""
ArrayDataset constructor.
:param x: collection of data samples
:param y: collection of labels (optional)
:param feature_names: list of str, The feature names, in the order that they appear in the data (optional)
:param kwargs: dataset parameters
"""
:param x: collection of data samples
:type x: numpy array or pandas DataFrame or list or pytorch Tensor
:param y: collection of labels
:type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
:param feature_names: The feature names, in the order that they appear in the data
:type feature_names: list of strings, optional
"""
def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
features_names: Optional[list] = None, **kwargs):
self.is_pandas = self.is_pandas = type(x) == pd.DataFrame or type(x) == pd.Series
self.features_names = features_names
@ -187,26 +228,100 @@ class ArrayDataset(Dataset):
raise ValueError("The supplied features are not the same as in the data features")
self.features_names = x.columns.to_list()
if y is not None and len(self._x) != len(self._y):
if self._y is not None and len(self._x) != len(self._y):
raise ValueError("Non equivalent lengths of x and y")
def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""Return data samples as numpy array"""
"""
Get data samples
:return: data samples as numpy array
"""
return self._x
def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""Return labels as numpy array"""
"""
Get labels
:return: labels as numpy array
"""
return self._y
def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""
Get predictions
:return: predictions as numpy array
"""
return None
class DatasetWithPredictions(Dataset):
"""
Dataset that is based on arrays (e.g., numpy/pandas/list...). Includes predictions from a model, and possibly also
features and true labels.

:param pred: collection of model predictions, one entry per sample
:type pred: numpy array or pandas DataFrame or list or pytorch Tensor
:param x: collection of data samples
:type x: numpy array or pandas DataFrame or list or pytorch Tensor, optional
:param y: collection of labels
:type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
:param features_names: The feature names, in the order that they appear in the data
:type features_names: list of strings, optional
:raises ValueError: if `x` or `y` is supplied with a length different from `pred`, or if `features_names` does not
                    match the columns of `x` in the pandas branch
"""
def __init__(self, pred: INPUT_DATA_ARRAY_TYPE, x: Optional[INPUT_DATA_ARRAY_TYPE] = None,
y: Optional[INPUT_DATA_ARRAY_TYPE] = None, features_names: Optional[list] = None, **kwargs):
# NOTE(review): is_pandas starts out False; presumably `_array2numpy` (defined outside this view) switches it on
# when it is handed pandas input - confirm, otherwise the column check below can never run.
self.is_pandas = False
self.features_names = features_names
# Predictions are mandatory; labels and samples are converted only when supplied.
self._pred = self._array2numpy(pred)
self._y = self._array2numpy(y) if y is not None else None
self._x = self._array2numpy(x) if x is not None else None
if self.is_pandas and x is not None:
# Explicitly supplied names must agree with the columns of x; the columns then become the feature names.
if features_names and not np.array_equal(features_names, x.columns):
raise ValueError("The supplied features are not the same as in the data features")
self.features_names = x.columns.to_list()
# Every optional collection that was supplied must line up one-to-one with the predictions.
if self._y is not None and len(self._pred) != len(self._y):
raise ValueError('Non equivalent lengths of pred and y')
if self._x is not None and len(self._x) != len(self._pred):
raise ValueError('Non equivalent lengths of x and pred')
def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""
Get data samples
:return: the converted data samples, or None if no samples were supplied at construction
"""
return self._x
def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""
Get labels
:return: the converted labels, or None if no labels were supplied at construction
"""
return self._y
def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""
Get predictions
:return: the converted predictions supplied at construction
"""
return self._pred
class PytorchData(Dataset):
"""
Dataset for pytorch models.
:param x: collection of data samples
:type x: numpy array or pandas DataFrame or list or pytorch Tensor
:param y: collection of labels
:type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
"""
def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None, **kwargs):
"""
PytorchData constructor.
:param x: collection of data samples
:param y: collection of labels (optional)
:param kwargs: dataset parameters
"""
self._y = array2torch_tensor(y) if y is not None else None
self._x = array2torch_tensor(x)
@ -215,7 +330,7 @@ class PytorchData(Dataset):
if self.is_pandas:
self.features_names = x.columns
if y is not None and len(self._x) != len(self._y):
if self._y is not None and len(self._x) != len(self._y):
raise ValueError("Non equivalent lengths of x and y")
if self._y is not None:
@ -224,17 +339,47 @@ class PytorchData(Dataset):
self.__getitem__ = self.get_sample_item
def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""Return data samples as numpy array"""
"""
Get data samples.
:return: samples as numpy array
"""
return array2numpy(self._x)
def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""Return labels as numpy array"""
"""
Get labels.
:return: labels as numpy array
"""
return array2numpy(self._y) if self._y is not None else None
def get_sample_item(self, idx) -> Tensor:
def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""
Get predictions
:return: predictions as numpy array
"""
return None
def get_sample_item(self, idx: int) -> Tensor:
"""
Get the sample according to the given index
:param idx: the index of the sample to return
:type idx: int
:return: the sample as a pytorch Tensor
"""
return self._x[idx]
def get_item(self, idx) -> Tensor:
def get_item(self, idx: int) -> Tensor:
"""
Get the sample and label according to the given index
:param idx: the index of the sample to return
:type idx: int
:return: the sample and label as pytorch Tensors. Returned as a tuple (sample, label)
"""
sample, label = self._x[idx], self._y[idx]
return sample, label
@ -251,11 +396,13 @@ class DatasetFactory:
def register(cls, name: str) -> Callable:
"""
Class method to register Dataset to the internal registry
:param name: dataset name
:return:
:type name: string
:return: a Callable that returns the registered dataset class
"""
def inner_wrapper(wrapped_class: Dataset) -> Any:
def inner_wrapper(wrapped_class: Type[Dataset]) -> Any:
if name in cls.registry:
logger.warning("Dataset %s already exists. Will replace it", name)
cls.registry[name] = wrapped_class
@ -267,11 +414,15 @@ class DatasetFactory:
def create_dataset(cls, name: str, **kwargs) -> Dataset:
"""
Factory command to create dataset instance.
This method gets the appropriate Dataset class from the registry
and creates an instance of it, while passing in the parameters
given in ``kwargs``.
:param name: The name of the dataset to create.
:type name: string
:param kwargs: dataset parameters
:type kwargs: keyword arguments as expected by the class
:return: An instance of the dataset that is created.
"""
if name not in cls.registry:
@ -285,13 +436,19 @@ class DatasetFactory:
class Data:
def __init__(self, train: Dataset = None, test: Dataset = None, **kwargs):
"""
Class for storing train and test datasets.
:param train: the training set
:type train: `Dataset`
:param test: the test set
:type test: `Dataset`, optional
"""
def __init__(self, train: Dataset = None, test: Optional[Dataset] = None, **kwargs):
"""
Data class constructor.
The class stores train and test datasets.
If neither of the datasets was provided,
Both train and test datasets will be create using
DatasetFactory to create a dataset instance
If neither of the datasets was provided, both train and test datasets will be created using `DatasetFactory`.
"""
if train or test:
self.train = train
@ -301,25 +458,77 @@ class Data:
self.test = DatasetFactory.create_dataset(train=False, **kwargs)
def get_train_set(self) -> Dataset:
"""Return train DatasetBase"""
"""
Get training set
:return: training `Dataset`
"""
return self.train
def get_test_set(self) -> Dataset:
"""Return test DatasetBase"""
"""
Get test set
:return: test `Dataset`
"""
return self.test
def get_train_samples(self) -> Collection[Any]:
"""Return train set samples"""
"""
Get train set samples, or None if no training data provided
:return: training samples
"""
if self.train is None:
return None
return self.train.get_samples()
def get_train_labels(self) -> Collection[Any]:
"""Return train set labels"""
"""
Get train set labels, or None if no training labels provided
:return: training labels
"""
if self.train is None:
return None
return self.train.get_labels()
def get_train_predictions(self) -> Collection[Any]:
"""
Get train set predictions, or None if no training predictions provided
:return: training predictions
"""
if self.train is None:
return None
return self.train.get_predictions()
def get_test_samples(self) -> Collection[Any]:
"""Return test set samples"""
"""
Get test set samples
:return: test samples, or None if no test data provided
"""
if self.test is None:
return None
return self.test.get_samples()
def get_test_labels(self) -> Collection[Any]:
"""Return test set labels"""
"""
Get test set labels
:return: test labels, or None if no test labels provided
"""
if self.test is None:
return None
return self.test.get_labels()
def get_test_predictions(self) -> Collection[Any]:
"""
Get test set predictions, or None if no test predictions provided
:return: test predictions
"""
if self.test is None:
return None
return self.test.get_predictions()

View file

@ -1,2 +1,6 @@
from apt.utils.models.model import Model, ModelOutputType
from apt.utils.models.model import Model, BlackboxClassifier, ModelOutputType, ScoringMethod, \
BlackboxClassifierPredictions, BlackboxClassifierPredictFunction, get_nb_classes, is_one_hot, \
check_correct_model_output
from apt.utils.models.sklearn_model import SklearnModel, SklearnClassifier, SklearnRegressor
from apt.utils.models.keras_model import KerasClassifier, KerasRegressor
from apt.utils.models.xgboost_model import XGBoostClassifier

View file

@ -0,0 +1,151 @@
from typing import Optional
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from tensorflow import keras
tf.compat.v1.disable_eager_execution()
from sklearn.metrics import mean_squared_error
from apt.utils.models import Model, ModelOutputType, ScoringMethod, check_correct_model_output
from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
from art.utils import check_and_transform_label_format
from art.estimators.classification.keras import KerasClassifier as ArtKerasClassifier
from art.estimators.regression.keras import KerasRegressor as ArtKerasRegressor
class KerasModel(Model):
"""
Wrapper class for keras models.

Common base of the keras wrappers in this module (`KerasClassifier` and `KerasRegressor` both derive from it).
It adds no attributes or methods of its own on top of `Model`.
"""
class KerasClassifier(KerasModel):
    """
    Wrapper class for keras classification models.

    :param model: The original keras model object.
    :type model: `keras.models.Model`
    :param output_type: The type of output the model yields (vector/label only)
    :type output_type: `ModelOutputType`
    :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                             Set to True if the model is only available via query (API) access, i.e.,
                             only the outputs of the model are exposed, and False if the model internals
                             are also available. Default is True.
    :type black_box_access: boolean, optional
    :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
                              unlimited queries to the model API or whether there is a limit to the number of
                              queries that can be submitted. Default is True.
    :type unlimited_queries: boolean, optional
    """

    def __init__(self, model: keras.models.Model, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
                 unlimited_queries: Optional[bool] = True, **kwargs):
        super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs)
        # The ART wrapper has to be told when the network emits raw logits rather than probabilities.
        use_logits = output_type == ModelOutputType.CLASSIFIER_LOGITS
        self._art_model = ArtKerasClassifier(model, use_logits=use_logits)

    def fit(self, train_data: Dataset, **kwargs) -> None:
        """
        Fit the model using the training data.

        :param train_data: Training data. Labels are expected to either be one-hot encoded or a 1D-array of categorical
                           labels (consecutive integers starting at 0).
        :type train_data: `Dataset`
        :param kwargs: Additional arguments forwarded to the `fit` method of the underlying ART classifier.
        :return: None
        """
        y_encoded = check_and_transform_label_format(train_data.get_labels(), self._art_model.nb_classes)
        self._art_model.fit(train_data.get_samples(), y_encoded, **kwargs)

    def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Perform predictions using the model for input `x`.

        :param x: Input samples.
        :type x: `Dataset`
        :param kwargs: Additional arguments forwarded to the `predict` method of the underlying ART classifier.
        :return: Predictions from the model as numpy array (class probabilities, if supported).
        :raises ValueError: if the shape of the predictions contradicts the declared output type.
        """
        predictions = self._art_model.predict(x.get_samples(), **kwargs)
        check_correct_model_output(predictions, self.output_type)
        return predictions

    def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.ACCURACY, **kwargs):
        """
        Score the model using test data.

        :param test_data: Test data.
        :type test_data: `Dataset`
        :param scoring_method: The method for scoring predictions. Default is ACCURACY.
        :type scoring_method: `ScoringMethod`, optional
        :param kwargs: Accepted for interface compatibility; currently not used.
        :return: the score as float (between 0 and 1)
        :raises NotImplementedError: if `scoring_method` is anything other than ACCURACY.
        """
        # Reject unsupported scoring methods up front instead of after a full prediction pass.
        if scoring_method != ScoringMethod.ACCURACY:
            raise NotImplementedError('Only ACCURACY supported as scoring method')
        y = check_and_transform_label_format(test_data.get_labels(), self._art_model.nb_classes)
        predicted = self.predict(test_data)
        # Both arrays are per-class vectors here, so compare the index of the highest entry in each row.
        correct = np.count_nonzero(np.argmax(y, axis=1) == np.argmax(predicted, axis=1))
        return correct / predicted.shape[0]
class KerasRegressor(KerasModel):
    """
    Wrapper class for keras regression models.

    :param model: The original keras model object.
    :type model: `keras.models.Model`
    :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                             Set to True if the model is only available via query (API) access, i.e.,
                             only the outputs of the model are exposed, and False if the model internals
                             are also available. Default is True.
    :type black_box_access: boolean, optional
    :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
                              unlimited queries to the model API or whether there is a limit to the number of
                              queries that can be submitted. Default is True.
    :type unlimited_queries: boolean, optional
    """

    def __init__(self, model: keras.models.Model, black_box_access: Optional[bool] = True,
                 unlimited_queries: Optional[bool] = True, **kwargs):
        # A regressor always yields a scalar value, so the output type is fixed rather than caller-supplied.
        super().__init__(model, ModelOutputType.REGRESSOR_SCALAR, black_box_access, unlimited_queries, **kwargs)
        self._art_model = ArtKerasRegressor(model)

    def fit(self, train_data: Dataset, **kwargs) -> None:
        """
        Fit the model using the training data.

        :param train_data: Training data.
        :type train_data: `Dataset`
        :param kwargs: Additional arguments forwarded to the `fit` method of the underlying ART regressor.
        :return: None
        """
        self._art_model.fit(train_data.get_samples(), train_data.get_labels(), **kwargs)

    def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Perform predictions using the model for input `x`.

        :param x: Input samples.
        :type x: `Dataset`
        :param kwargs: Additional arguments forwarded to the `predict` method of the underlying ART regressor.
        :return: Predictions from the model as numpy array.
        """
        return self._art_model.predict(x.get_samples(), **kwargs)

    def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.MEAN_SQUARED_ERROR,
              **kwargs):
        """
        Score the model using test data.

        :param test_data: Test data.
        :type test_data: `Dataset`
        :param scoring_method: The method for scoring predictions. Default is MEAN_SQUARED_ERROR.
        :type scoring_method: `ScoringMethod`, optional
        :param kwargs: Accepted for interface compatibility; currently not used.
        :return: the score as float
        :raises NotImplementedError: if `scoring_method` is anything other than MEAN_SQUARED_ERROR.
        """
        # Reject unsupported scoring methods up front instead of after a full prediction pass.
        if scoring_method != ScoringMethod.MEAN_SQUARED_ERROR:
            raise NotImplementedError('Only MEAN_SQUARED_ERROR supported as scoring method')
        predicted = self.predict(test_data)
        return mean_squared_error(test_data.get_labels(), predicted)

View file

@ -1,37 +1,91 @@
from abc import ABCMeta, abstractmethod
from typing import Any, Optional
from typing import Any, Optional, Callable, Tuple, Union
from enum import Enum, auto
import numpy as np
from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
from apt.utils.datasets import Dataset, Data, OUTPUT_DATA_ARRAY_TYPE
from art.estimators.classification import BlackBoxClassifier
from art.utils import check_and_transform_label_format
class ModelOutputType(Enum):
CLASSIFIER_VECTOR = auto() # probabilities or logits
CLASSIFIER_PROBABILITIES = auto() # vector of probabilities
CLASSIFIER_LOGITS = auto() # vector of logits
CLASSIFIER_SCALAR = auto() # label only
REGRESSOR_SCALAR = auto() # value
class ModelType(Enum):
# Closed set of model kinds that can be named without supplying an actual model object.
# Member values come from auto(), so they depend on declaration order - append new members at the end.
SKLEARN_DECISION_TREE = auto()
SKLEARN_GRADIENT_BOOSTING = auto()
class ScoringMethod(Enum):
# Metrics that a model wrapper's score() method can be asked to compute.
# Member values come from auto(), so they depend on declaration order - append new members at the end.
ACCURACY = auto() # number of correct predictions divided by the number of samples
MEAN_SQUARED_ERROR = auto() # mean squared error between the predictions and true labels
def is_one_hot(y: OUTPUT_DATA_ARRAY_TYPE) -> bool:
    """
    Tell whether `y` is laid out like a one-hot (or per-class vector) array.

    :param y: The array to inspect; only its `shape` attribute is read
    :type y: numpy array
    :return: True if `y` is two-dimensional with more than one column, otherwise False
    """
    dims = y.shape
    if len(dims) != 2:
        return False
    return dims[1] > 1
def get_nb_classes(y: OUTPUT_DATA_ARRAY_TYPE) -> int:
    """
    Get the number of classes from an array of labels

    :param y: The labels, either one-hot encoded (2D with more than one column) or categorical values, in which
              case the number of classes is taken to be the largest label plus one
    :type y: numpy array
    :return: The number of classes as integer; 0 if `y` is None or holds no categorical labels at all
    :raises ValueError: if `y` is not a numpy array
    """
    if y is None:
        return 0

    # isinstance (rather than an exact type comparison) also admits ndarray subclasses.
    if not isinstance(y, np.ndarray):
        raise ValueError("Input should be numpy array")

    if is_one_hot(y):
        return y.shape[1]
    # np.max raises on a zero-size array; no labels means no classes, consistent with the None case.
    if y.size == 0:
        return 0
    return int(np.max(y) + 1)
def check_correct_model_output(y: OUTPUT_DATA_ARRAY_TYPE, output_type: ModelOutputType):
    """
    Checks whether there is a mismatch between the declared model output type and its actual output.

    Only one direction is checked: output that is not a per-class vector while the declared type is a vector type
    (probabilities or logits).

    :param y: Model output
    :type y: numpy array
    :param output_type: Declared output type (provided at init)
    :type output_type: ModelOutputType
    :raises: ValueError (in case of mismatch)
    """
    vector_types = (ModelOutputType.CLASSIFIER_PROBABILITIES, ModelOutputType.CLASSIFIER_LOGITS)
    if not is_one_hot(y) and output_type in vector_types:
        # Build one message string; passing output_type as a second argument made the error render as a tuple.
        raise ValueError("Incompatible model output types. Model outputs 1D array of categorical scalars while "
                         "output type is set to {}".format(output_type))
class Model(metaclass=ABCMeta):
"""
Abstract base class for ML model wrappers.
:param model: The original model object (of the underlying ML framework)
:type model: framework-specific model object
:param output_type: The type of output the model yields (vector/label only for classifiers,
value for regressors)
:type output_type: `ModelOutputType`
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Default is True.
:type black_box_access: boolean, optional
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Default is True.
:type unlimited_queries: boolean, optional
"""
def __init__(self, model: Any, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
unlimited_queries: Optional[bool] = True, **kwargs):
"""
Initialize a `Model` wrapper object.
:param model: The original model object (of the underlying ML framework)
:param output_type: The type of output the model yields (vector/label only for classifiers,
value for regressors)
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Optional, Default is True.
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Optional, Default is True.
"""
self._model = model
self._output_type = output_type
self._black_box_access = black_box_access
@ -53,8 +107,8 @@ class Model(metaclass=ABCMeta):
Perform predictions using the model for input `x`.
:param x: Input samples.
:type x: `np.ndarray` or `pandas.DataFrame`
:return: Predictions from the model.
:type x: `Dataset`
:return: Predictions from the model as numpy array.
"""
raise NotImplementedError
@ -65,13 +119,14 @@ class Model(metaclass=ABCMeta):
:param test_data: Test data.
:type test_data: `Dataset`
:return: the score as float (for classifiers, between 0 and 1)
"""
return NotImplementedError
@property
def model(self) -> Any:
"""
Return the model.
Return the underlying model.
:return: The model.
"""
@ -89,21 +144,223 @@ class Model(metaclass=ABCMeta):
@property
def black_box_access(self) -> bool:
"""
Return True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals are also available.
Return whether the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, or if the model internals are also available.
:return: True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals are also available.
:return: True if the model is only available via query (API) access, otherwise False.
"""
return self._black_box_access
@property
def unlimited_queries(self) -> bool:
"""
If black_box_access is True, Return whether a user can perform unlimited queries to the model API
If black_box_access is True, return whether a user can perform unlimited queries to the model API
or whether there is a limit to the number of queries that can be submitted.
:return: If black_box_access is True, Return whether a user can perform unlimited queries to the model API
or whether there is a limit to the number of queries that can be submitted.
:return: True if a user can perform unlimited queries to the model API, otherwise False.
"""
return self._unlimited_queries
class BlackboxClassifier(Model):
    """
    Wrapper for black-box ML classification models.

    :param model: The training and/or test data along with the model's predictions for the data or a callable predict
                  method.
    :type model: `Data` object or Callable
    :param output_type: The type of output the model yields (vector/label only)
    :type output_type: `ModelOutputType`
    :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                             Always assumed to be True (black box) for this wrapper.
    :type black_box_access: boolean, optional
    :param unlimited_queries: Boolean indicating whether a user can perform unlimited queries to the model API.
    :type unlimited_queries: boolean, optional
    :param model_type: The type of model this BlackboxClassifier represents. Needed in order to build and/or fit
                       similar dummy/shadow models.
    :type model_type: Either a (unfitted) model object of the underlying framework, or a ModelType representing the
                      type of the model, optional.
    """

    def __init__(self, model: Any, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
                 unlimited_queries: Optional[bool] = True, model_type: Optional[Union[Any, ModelType]] = None,
                 **kwargs):
        # The caller's `black_box_access` value is intentionally not forwarded: this wrapper is always black-box.
        super().__init__(model, output_type, black_box_access=True, unlimited_queries=unlimited_queries, **kwargs)
        self._model_type = model_type
        # Not known at this level; the concrete subclasses fill these in.
        self._input_shape = None
        self._nb_classes = None

    @property
    def nb_classes(self) -> int:
        """
        Return the number of prediction classes of the model.

        :return: Number of prediction classes of the model.
        """
        return self._nb_classes

    @property
    def input_shape(self) -> Tuple[int, ...]:
        """
        Return the shape of input to the model.

        :return: Shape of input to the model.
        """
        return self._input_shape

    @property
    def model_type(self) -> Optional[Union[Any, ModelType]]:
        """
        Return the type of the model.

        :return: Either a (unfitted) model object of the underlying framework, or a ModelType representing the type of
                 the model, or None (if none provided at init).
        """
        return self._model_type

    def fit(self, train_data: Dataset, **kwargs) -> None:
        """
        A blackbox model cannot be fit.

        :raises NotImplementedError: always.
        """
        raise NotImplementedError

    def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Get predictions from the model for input `x`. `x` must be a subset of the data provided in the `model` data in
        `__init__()`.

        :param x: Input samples.
        :type x: `Dataset`
        :return: Predictions from the model as numpy array.
        """
        samples = x.get_samples()
        predictions = self._art_model.predict(samples)
        # Validate the predictions against the declared output type before handing them back.
        check_correct_model_output(predictions, self.output_type)
        return predictions

    def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.ACCURACY, **kwargs):
        """
        Score the model using test data.

        :param test_data: Test data.
        :type test_data: `Dataset`
        :param scoring_method: The method for scoring predictions. Default is ACCURACY.
        :type scoring_method: `ScoringMethod`, optional
        :return: the score as float (for classifiers, between 0 and 1)
        :raises ValueError: if the test samples or the test labels are missing.
        :raises NotImplementedError: for any scoring method other than ACCURACY.
        """
        data_missing = test_data.get_samples() is None or test_data.get_labels() is None
        if data_missing:
            raise ValueError('score can only be computed when test data and labels are available')

        predicted = self._art_model.predict(test_data.get_samples())
        y = check_and_transform_label_format(test_data.get_labels(), nb_classes=self._nb_classes)

        if scoring_method != ScoringMethod.ACCURACY:
            raise NotImplementedError

        # Accuracy: fraction of rows whose arg-max class matches between labels and predictions.
        true_classes = np.argmax(y, axis=1)
        predicted_classes = np.argmax(predicted, axis=1)
        return np.count_nonzero(true_classes == predicted_classes) / predicted.shape[0]
class BlackboxClassifierPredictions(BlackboxClassifier):
    """
    Wrapper for black-box ML classification models using data and predictions.

    :param model: The training and/or test data along with the model's predictions for the data. Assumes that the data
                  is represented as numpy arrays. Labels are expected to either be class probabilities (multi-column)
                  or a 1D-array of categorical labels (consecutive integers starting at 0).
    :type model: `Data` object
    :param output_type: The type of output the model yields (vector/label only)
    :type output_type: `ModelOutputType`
    :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                             Always assumed to be True for this wrapper.
    :type black_box_access: boolean, optional
    :param unlimited_queries: Boolean indicating whether a user can perform unlimited queries to the model API.
                              Always assumed to be False for this wrapper.
    :type unlimited_queries: boolean, optional
    :raises NotImplementedError: if both train and test data are supplied and any of the four arrays is not a numpy
                                 array, or if neither a complete train pair nor a complete test pair is available.
    """

    def __init__(self, model: Data, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
                 unlimited_queries: Optional[bool] = True, **kwargs):
        # Both flags are fixed for this wrapper; the caller-supplied values are not forwarded.
        super().__init__(model, output_type, black_box_access=True, unlimited_queries=False, **kwargs)

        x_train_pred = model.get_train_samples()
        y_train_pred = model.get_train_predictions()
        if y_train_pred is None:
            # No stored predictions for the train split: fall back to its labels.
            y_train_pred = model.get_train_labels()

        x_test_pred = model.get_test_samples()
        y_test_pred = model.get_test_predictions()
        if y_test_pred is None:
            # No stored predictions for the test split: fall back to its labels.
            y_test_pred = model.get_test_labels()

        if y_train_pred is not None:
            check_correct_model_output(y_train_pred, self.output_type)
        if y_test_pred is not None:
            check_correct_model_output(y_test_pred, self.output_type)

        # 1D label arrays are run through check_and_transform_label_format before being stored.
        if y_train_pred is not None and len(y_train_pred.shape) == 1:
            self._nb_classes = get_nb_classes(y_train_pred)
            y_train_pred = check_and_transform_label_format(y_train_pred, nb_classes=self._nb_classes)
        if y_test_pred is not None and len(y_test_pred.shape) == 1:
            if self._nb_classes is None:
                self._nb_classes = get_nb_classes(y_test_pred)
            y_test_pred = check_and_transform_label_format(y_test_pred, nb_classes=self._nb_classes)

        if x_train_pred is not None and y_train_pred is not None and x_test_pred is not None and y_test_pred is not None:
            # All four arrays (samples AND predictions) must be numpy arrays before they are stacked.
            stacked_inputs = (x_train_pred, x_test_pred, y_train_pred, y_test_pred)
            if not all(isinstance(array, np.ndarray) for array in stacked_inputs):
                raise NotImplementedError("X/Y Data should be numpy array")
            x_pred = np.vstack((x_train_pred, x_test_pred))
            y_pred = np.vstack((y_train_pred, y_test_pred))
        elif x_test_pred is not None and y_test_pred is not None:
            x_pred = x_test_pred
            y_pred = y_test_pred
        elif x_train_pred is not None and y_train_pred is not None:
            x_pred = x_train_pred
            y_pred = y_train_pred
        else:
            raise NotImplementedError("Invalid data - None")

        # Recomputed from the final prediction array, replacing any value derived above.
        self._nb_classes = get_nb_classes(y_pred)
        self._input_shape = x_pred.shape[1:]
        self._x_pred = x_pred
        self._y_pred = y_pred
        # The (samples, predictions) pair is passed to the ART classifier in place of a predict function.
        predict_fn = (x_pred, y_pred)
        self._art_model = BlackBoxClassifier(predict_fn, self._input_shape, self._nb_classes, fuzzy_float_compare=True,
                                             preprocessing=None)

    def get_predictions(self) -> Tuple[OUTPUT_DATA_ARRAY_TYPE, OUTPUT_DATA_ARRAY_TYPE]:
        """
        Return all the data for which the model contains predictions.

        :return: Tuple containing data and predictions as numpy arrays.
        """
        return self._x_pred, self._y_pred
class BlackboxClassifierPredictFunction(BlackboxClassifier):
    """
    Wrapper for black-box ML classification models using a predict function.

    :param model: Function that takes in an `np.ndarray` of input data and returns predictions either as class
                  probabilities (multi-column) or a 1D-array of categorical labels (consecutive integers starting at 0).
    :type model: Callable
    :param output_type: The type of output the model yields (vector/label only)
    :type output_type: `ModelOutputType`
    :param input_shape: Shape of input to the model.
    :type input_shape: Tuple[int, ...]
    :param nb_classes: Number of prediction classes of the model.
    :type nb_classes: int
    :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                             Always assumed to be True for this wrapper.
    :type black_box_access: boolean, optional
    :param unlimited_queries: Boolean indicating whether a user can perform unlimited queries to the model API.
    :type unlimited_queries: boolean, optional
    """

    def __init__(self, model: Callable, output_type: ModelOutputType, input_shape: Tuple[int, ...], nb_classes: int,
                 black_box_access: Optional[bool] = True, unlimited_queries: Optional[bool] = True, **kwargs):
        # The caller's `black_box_access` value is not forwarded: this wrapper is always black-box.
        super().__init__(model, output_type, black_box_access=True, unlimited_queries=unlimited_queries, **kwargs)
        self._input_shape = input_shape
        self._nb_classes = nb_classes

        def predict_wrapper(x):
            # Call the user-supplied function and hand back one-hot output; anything else is converted first.
            raw_output = self.model(x)
            if is_one_hot(raw_output):
                return raw_output
            return check_and_transform_label_format(raw_output, nb_classes=nb_classes, return_one_hot=True)

        self._art_model = BlackBoxClassifier(predict_wrapper, self._input_shape, self._nb_classes, preprocessing=None)

View file

@ -1,15 +1,14 @@
from typing import Optional
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator
from apt.utils.models import Model, ModelOutputType
from apt.utils.models import Model, ModelOutputType, get_nb_classes, check_correct_model_output
from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
from art.estimators.classification.scikitlearn import SklearnClassifier as ArtSklearnClassifier
from art.estimators.regression.scikitlearn import ScikitlearnRegressor
from art.utils import check_and_transform_label_format
class SklearnModel(Model):
@ -22,6 +21,7 @@ class SklearnModel(Model):
:param test_data: Test data.
:type train_data: `Dataset`
:return: the score as float (for classifiers, between 0 and 1)
"""
return self.model.score(test_data.get_samples(), test_data.get_labels(), **kwargs)
@ -29,23 +29,23 @@ class SklearnModel(Model):
class SklearnClassifier(SklearnModel):
"""
Wrapper class for scikitlearn classification models.
:param model: The original sklearn model object.
:type model: scikitlearn classifier object
:param output_type: The type of output the model yields (vector/label only)
:type output_type: `ModelOutputType`
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Default is True.
:type black_box_access: boolean, optional
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Default is True.
:type unlimited_queries: boolean, optional
"""
def __init__(self, model: BaseEstimator, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
unlimited_queries: Optional[bool] = True, **kwargs):
"""
Initialize a `SklearnClassifier` wrapper object.
:param model: The original sklearn model object.
:param output_type: The type of output the model yields (vector/label only for classifiers,
value for regressors)
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Optional, Default is True.
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Optional, Default is True.
"""
super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs)
self._art_model = ArtSklearnClassifier(model)
@ -53,11 +53,14 @@ class SklearnClassifier(SklearnModel):
"""
Fit the model using the training data.
:param train_data: Training data.
:param train_data: Training data. Labels are expected to either be one-hot encoded or a 1D-array of categorical
labels (consecutive integers starting at 0).
:type train_data: `Dataset`
:return: None
"""
encoder = OneHotEncoder(sparse=False)
y_encoded = encoder.fit_transform(train_data.get_labels().reshape(-1, 1))
y = train_data.get_labels()
self.nb_classes = get_nb_classes(y)
y_encoded = check_and_transform_label_format(y, nb_classes=self.nb_classes)
self._art_model.fit(train_data.get_samples(), y_encoded, **kwargs)
def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
@ -65,30 +68,32 @@ class SklearnClassifier(SklearnModel):
Perform predictions using the model for input `x`.
:param x: Input samples.
:type x: `np.ndarray` or `pandas.DataFrame`
:return: Predictions from the model (class probabilities, if supported).
:type x: `Dataset`
:return: Predictions from the model as numpy array (class probabilities, if supported).
"""
return self._art_model.predict(x, **kwargs)
predictions = self._art_model.predict(x.get_samples(), **kwargs)
check_correct_model_output(predictions, self.output_type)
return predictions
class SklearnRegressor(SklearnModel):
"""
Wrapper class for scikitlearn regression models.
:param model: The original sklearn model object.
:type model: scikitlearn regressor object
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Default is True.
:type black_box_access: boolean, optional
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Default is True.
:type unlimited_queries: boolean, optional
"""
def __init__(self, model: BaseEstimator, black_box_access: Optional[bool] = True,
unlimited_queries: Optional[bool] = True, **kwargs):
"""
Initialize a `SklearnRegressor` wrapper object.
:param model: The original sklearn model object.
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Optional, Default is True.
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Optional, Default is True.
"""
super().__init__(model, ModelOutputType.REGRESSOR_SCALAR, black_box_access, unlimited_queries, **kwargs)
self._art_model = ScikitlearnRegressor(model)
@ -98,6 +103,7 @@ class SklearnRegressor(SklearnModel):
:param train_data: Training data.
:type train_data: `Dataset`
:return: None
"""
self._art_model.fit(train_data.get_samples(), train_data.get_labels(), **kwargs)
@ -106,7 +112,7 @@ class SklearnRegressor(SklearnModel):
Perform predictions using the model for input `x`.
:param x: Input samples.
:type x: `np.ndarray` or `pandas.DataFrame`
:return: Predictions from the model.
:type x: `Dataset`
:return: Predictions from the model as numpy array.
"""
return self._art_model.predict(x, **kwargs)
return self._art_model.predict(x.get_samples(), **kwargs)

View file

@ -0,0 +1,87 @@
from typing import Optional, Tuple
from apt.utils.models import Model, ModelOutputType, ScoringMethod, check_correct_model_output, is_one_hot
from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
from xgboost import XGBClassifier
import numpy as np
from art.estimators.classification.xgboost import XGBoostClassifier as ArtXGBoostClassifier
class XGBoostModel(Model):
    """
    Wrapper class for xgboost models.

    Adds no behavior of its own on top of `Model`; it is the base class of the xgboost wrapper(s) defined in this
    module (`XGBoostClassifier`).
    """
class XGBoostClassifier(XGBoostModel):
    """
    Wrapper class for xgboost classification models.

    :param model: The original xgboost model object. Must be fit.
    :type model: Booster or XGBClassifier object
    :param output_type: The type of output the model yields (vector/label only)
    :type output_type: `ModelOutputType`
    :param input_shape: Shape of input to the model.
    :type input_shape: Tuple[int, ...]
    :param nb_classes: Number of prediction classes of the model.
    :type nb_classes: int
    :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                             Set to True if the model is only available via query (API) access, i.e.,
                             only the outputs of the model are exposed, and False if the model internals
                             are also available. Default is True.
    :type black_box_access: boolean, optional
    :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
                              unlimited queries to the model API or whether there is a limit to the number of
                              queries that can be submitted. Default is True.
    :type unlimited_queries: boolean, optional
    """

    def __init__(self, model: XGBClassifier, output_type: ModelOutputType, input_shape: Tuple[int, ...],
                 nb_classes: int, black_box_access: Optional[bool] = True,
                 unlimited_queries: Optional[bool] = True, **kwargs):
        super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs)
        # The first entry of `input_shape` is used as the number of features for the ART wrapper.
        self._art_model = ArtXGBoostClassifier(model, nb_features=input_shape[0], nb_classes=nb_classes)
        self.nb_classes = nb_classes

    def fit(self, train_data: Dataset, **kwargs) -> None:
        """
        Fit the model using the training data.

        :param train_data: Training data. Labels are expected to either be one-hot encoded or a 1D-array of categorical
                           labels (consecutive integers starting at 0).
        :type train_data: `Dataset`
        :param kwargs: Additional keyword arguments forwarded to the underlying model's `fit` method.
        :return: None
        """
        y = train_data.get_labels()
        # One-hot labels are collapsed to class indices, the same conversion `score` applies to its labels.
        if is_one_hot(y):
            y = np.argmax(y, axis=1)
        # Fitting goes through the wrapped model object held by the ART wrapper, not through the ART wrapper itself.
        self._art_model._model.fit(train_data.get_samples(), y, **kwargs)

    def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Perform predictions using the model for input `x`.

        :param x: Input samples.
        :type x: `Dataset`
        :return: Predictions from the model as numpy array (class probabilities, if supported).
        """
        predictions = self._art_model.predict(x.get_samples(), **kwargs)
        check_correct_model_output(predictions, self.output_type)
        return predictions

    def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.ACCURACY, **kwargs):
        """
        Score the model using test data.

        :param test_data: Test data.
        :type test_data: `Dataset`
        :param scoring_method: The method for scoring predictions. Default is ACCURACY.
        :type scoring_method: `ScoringMethod`, optional
        :return: the score as float (for classifiers, between 0 and 1)
        :raises ValueError: if the test samples or the test labels are missing.
        :raises NotImplementedError: for any scoring method other than ACCURACY.
        """
        if test_data.get_samples() is None or test_data.get_labels() is None:
            raise ValueError('score can only be computed when test data and labels are available')
        y = test_data.get_labels()
        predicted = self.predict(test_data)
        # Compare class indices: collapse one-hot predictions and/or labels first.
        if is_one_hot(predicted):
            predicted = np.argmax(predicted, axis=1)
        if is_one_hot(y):
            y = np.argmax(y, axis=1)
        if scoring_method == ScoringMethod.ACCURACY:
            return np.count_nonzero(y == predicted) / predicted.shape[0]
        else:
            raise NotImplementedError