diff --git a/apt/utils/datasets/__init__.py b/apt/utils/datasets/__init__.py index d80fd72..610dc46 100644 --- a/apt/utils/datasets/__init__.py +++ b/apt/utils/datasets/__init__.py @@ -3,4 +3,4 @@ The AI Privacy Toolbox (datasets). Implementation of datasets utility components for datasets creation, load, and store """ -from apt.utils.datasets.datasets import DatasetABC, StoredDatasetABC, DatasetFactory, Data, BaseDataset +from apt.utils.datasets.datasets import DatasetABC, StoredDatasetABC, DatasetFactory, Data, BaseDataset, DATA_ARRAY_TYPE diff --git a/apt/utils/datasets/datasets.py b/apt/utils/datasets/datasets.py index c9fa0a5..f6dee7a 100644 --- a/apt/utils/datasets/datasets.py +++ b/apt/utils/datasets/datasets.py @@ -5,17 +5,21 @@ Implementation of utility classes for dataset handling """ from abc import ABCMeta, abstractmethod -from typing import Callable, Collection, Any +from typing import Callable, Collection, Any, Union import tarfile import os import urllib.request import numpy as np +import pandas as pd import logging logger = logging.getLogger(__name__) +DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame] + + class DatasetABC(metaclass=ABCMeta): """Base Abstract Class for Dataset""" @@ -122,7 +126,7 @@ class StoredDatasetABC(DatasetABC): class BaseDataset(DatasetABC): """Base Class for Dataset""" - def __init__(self, x, y, **kwargs): + def __init__(self, x: DATA_ARRAY_TYPE, y: DATA_ARRAY_TYPE, **kwargs): """ BaseDataset constructor. :param x: collection of data samples @@ -135,11 +139,11 @@ class BaseDataset(DatasetABC): if len(self.x) != len(self.y): raise ValueError('Non equivalent lengths of x and y') - def get_samples(self) -> Collection[Any]: + def get_samples(self) -> DATA_ARRAY_TYPE: """Return data samples""" return self.x - def get_labels(self) -> Collection[Any]: + def get_labels(self) -> DATA_ARRAY_TYPE: """Return labels""" return self.y @@ -192,7 +196,7 @@ class Data: The class stores train and test datasets. 
If neither of the datasets was provided, Both train and test datasets will be create using - Factory command to create dataset instance + DatasetFactory to create a dataset instance """ if train or test: self.train = train @@ -209,18 +213,18 @@ class Data: """Return test DatasetBase""" return self.test - def get_train_samples(self): + def get_train_samples(self) -> Collection[Any]: """Return train set samples""" return self.train.get_samples() - def get_train_labels(self): + def get_train_labels(self) -> Collection[Any]: """Return train set labels""" return self.train.get_labels() - def get_test_samples(self): + def get_test_samples(self) -> Collection[Any]: """Return test set samples""" return self.test.get_samples() - def get_test_labels(self): + def get_test_labels(self) -> Collection[Any]: """Return test set labels""" - return self.test.get_labels() \ No newline at end of file + return self.test.get_labels() diff --git a/apt/utils/models/model.py b/apt/utils/models/model.py index f1fdce6..d025a28 100644 --- a/apt/utils/models/model.py +++ b/apt/utils/models/model.py @@ -1,34 +1,34 @@ -from abc import ABC, abstractmethod -from typing import Union, List, Any, Optional -import numpy as np +from abc import ABCMeta, abstractmethod +from typing import Any -class Model(ABC): +from apt.utils.datasets import BaseDataset, DATA_ARRAY_TYPE + + +class Model(metaclass=ABCMeta): """ - Base class for ML model wrappers. + Abstract base class for ML model wrappers. """ def __init__(self, model: Any, **kwargs): """ - Initialize a `Model` wrapper object. + Initialize a `Model` wrapper object. - :param model: The original model object (of the underlying ML framework) + :param model: The original model object (of the underlying ML framework) """ self._model = model @abstractmethod - def fit(self, x: np.ndarray, y: np.ndarray, **kwargs) -> None: + def fit(self, train_data: BaseDataset, **kwargs) -> None: """ - Fit the model using the training data `(x, y)`. 
+ Fit the model using the training data. - :param x: Training data. - :type x: `np.ndarray` or `pandas.DataFrame` - :param y: True labels. - :type y: `np.ndarray` or `pandas.DataFrame` + :param train_data: Training data. + :type train_data: `BaseDataset` """ raise NotImplementedError @abstractmethod - def predict(self, x: np.ndarray, **kwargs) -> np.ndarray: + def predict(self, x: DATA_ARRAY_TYPE, **kwargs) -> DATA_ARRAY_TYPE: """ Perform predictions using the model for input `x`. @@ -46,67 +46,3 @@ class Model(ABC): :return: The model. """ return self._model - - -class SingleOutputModel(Model): - """ - Wrapper class for ML models whose output is a single value (e.g., classification with label only output, regression). - """ - - -class MultipleOutputModel(Model): - """ - Wrapper class for ML models whose output is a vector (e.g., class probabilities or logits). - """ - - -class ModelWithLoss(Model): - """ - Wrapper class for ML models that support computing loss values for predictions. - """ - - def __init__(self, model: Any, loss: Optional[Any] = None, **kwargs): - """ - Initialize a `ModelWithLoss` wrapper object. - - :param model: The original model object (of the underlying ML framework) - :param loss: The loss function/object of the model (of the underlying ML framework) - """ - super().__init__(model, **kwargs) - self._loss = loss - - - # Probably not needed for now, as we will not be using these wrappers directly in ART. - # @abstractmethod - # def loss(self, x: np.ndarray, y: np.ndarray, **kwargs) -> np.ndarray: - # """ - # Compute the loss of the model for samples `x`. - # - # :param x: Input samples. - # :type x: `np.ndarray` or `pandas.DataFrame` - # :param y: True labels. - # :type y: `np.ndarray` or `pandas.DataFrame` - # :return: Loss values. - # """ - # raise NotImplementedError - - -# Probably not needed for now, as we will not be using these wrappers directly in ART. 
-# class ModelWithGradients(Model): -# """ -# Wrapper class for ML models that support computing gradients. -# """ -# @abstractmethod -# def class_gradient(self, x: np.ndarray, label: Union[int, List[int], None] = None, **kwargs) -> np.ndarray: -# """ -# Compute per-class derivatives w.r.t. input `x`. -# -# :param x: Input samples. -# :type x: `np.ndarray` or `pandas.DataFrame` -# :param label: Index of a specific class. If provided, the gradient of the specified class -# is computed for all samples. Otherwise, gradients for all classes are computed for all samples. -# :param label: int -# :return: Gradients of input features w.r.t. each class in the form `(batch_size, nb_classes, input_shape)` when -# computing for all classes, or `(batch_size, 1, input_shape)` when `label` is specified. -# """ -# raise NotImplementedError diff --git a/apt/utils/models/sklearn_model.py b/apt/utils/models/sklearn_model.py index dffa356..92e8ba0 100644 --- a/apt/utils/models/sklearn_model.py +++ b/apt/utils/models/sklearn_model.py @@ -1,7 +1,10 @@ import numpy as np -from sklearn.preprocessing import OneHotEncoder -from apt.utils.models import Model, ModelWithLoss, SingleOutputModel +from sklearn.preprocessing import OneHotEncoder +from sklearn.base import BaseEstimator + +from apt.utils.models import Model +from apt.utils.datasets import BaseDataset, DATA_ARRAY_TYPE from art.estimators.classification.scikitlearn import SklearnClassifier as ArtSklearnClassifier from art.estimators.regression.scikitlearn import ScikitlearnRegressor @@ -11,23 +14,21 @@ class SklearnModel(Model): """ Wrapper class for scikitlearn models. """ - def score(self, x: np.ndarray, y: np.ndarray, **kwargs): + def score(self, test_data: BaseDataset, **kwargs): """ - Score the model using test data `(x, y)`. + Score the model using test data. - :param x: Test data. - :type x: `np.ndarray` or `pandas.DataFrame` - :param y: True labels. - :type y: `np.ndarray` or `pandas.DataFrame` + :param test_data: Test data. 
+ :type test_data: `BaseDataset` """ - return self.model.score(x, y, **kwargs) + return self.model.score(test_data.get_samples(), test_data.get_labels(), **kwargs) class SklearnClassifier(SklearnModel): """ Wrapper class for scikitlearn classification models. """ - def __init__(self, model, **kwargs): + def __init__(self, model: BaseEstimator, **kwargs): """ Initialize a `SklearnClassifier` wrapper object. @@ -36,35 +37,33 @@ class SklearnClassifier(SklearnModel): super().__init__(model, **kwargs) self._art_model = ArtSklearnClassifier(model) - def fit(self, x: np.ndarray, y: np.ndarray, **kwargs) -> None: + def fit(self, train_data: BaseDataset, **kwargs) -> None: """ - Fit the model using the training data `(x, y)`. + Fit the model using the training data. - :param x: Training data. - :type x: `np.ndarray` or `pandas.DataFrame` - :param y: True labels. - :type y: `np.ndarray` or `pandas.DataFrame` + :param train_data: Training data. + :type train_data: `BaseDataset` """ encoder = OneHotEncoder(sparse=False) - y_encoded = encoder.fit_transform(y.reshape(-1, 1)) - self._art_model.fit(x, y_encoded, **kwargs) + y_encoded = encoder.fit_transform(train_data.get_labels().reshape(-1, 1)) + self._art_model.fit(train_data.get_samples(), y_encoded, **kwargs) - def predict(self, x: np.ndarray, **kwargs) -> np.ndarray: + def predict(self, x: DATA_ARRAY_TYPE, **kwargs) -> DATA_ARRAY_TYPE: """ Perform predictions using the model for input `x`. :param x: Input samples. :type x: `np.ndarray` or `pandas.DataFrame` - :return: Predictions from the model. + :return: Predictions from the model (class probabilities, if supported). """ return self._art_model.predict(x, **kwargs) -class SklearnRegressor(SklearnModel, SingleOutputModel, ModelWithLoss): +class SklearnRegressor(SklearnModel): """ Wrapper class for scikitlearn regression models. 
""" - def __init__(self, model, **kwargs): + def __init__(self, model: BaseEstimator, **kwargs): """ Initialize a `SklearnRegressor` wrapper object. @@ -73,18 +72,16 @@ class SklearnRegressor(SklearnModel, SingleOutputModel, ModelWithLoss): super().__init__(model, **kwargs) self._art_model = ScikitlearnRegressor(model) - def fit(self, x: np.ndarray, y: np.ndarray, **kwargs) -> None: + def fit(self, train_data: BaseDataset, **kwargs) -> None: """ - Fit the model using the training data `(x, y)`. + Fit the model using the training data. - :param x: Training data. - :type x: `np.ndarray` or `pandas.DataFrame` - :param y: True labels. - :type y: `np.ndarray` or `pandas.DataFrame` + :param train_data: Training data. + :type train_data: `BaseDataset` """ - self._art_model.fit(x, y, **kwargs) + self._art_model.fit(train_data.get_samples(), train_data.get_labels(), **kwargs) - def predict(self, x: np.ndarray, **kwargs) -> np.ndarray: + def predict(self, x: DATA_ARRAY_TYPE, **kwargs) -> DATA_ARRAY_TYPE: """ Perform predictions using the model for input `x`. @@ -93,50 +90,3 @@ class SklearnRegressor(SklearnModel, SingleOutputModel, ModelWithLoss): :return: Predictions from the model. """ return self._art_model.predict(x, **kwargs) - - def loss(self, x: np.ndarray, y: np.ndarray, **kwargs) -> np.ndarray: - """ - Compute the loss of the model for samples `x`. - - :param x: Input samples. - :type x: `np.ndarray` or `pandas.DataFrame` - :param y: True labels. - :type y: `np.ndarray` or `pandas.DataFrame` - :return: Loss values. - """ - return self._art_model.compute_loss(x, y, **kwargs) - - -# Probably not needed for now, as we will not be using these wrappers directly in ART. -# class SklearnDecisionTreeClassifier(SklearnClassifier, MultipleOutputModel): -# """ -# Wrapper class for scikitlearn decision tree classifier models. -# """ -# def __init__(self, model): -# """ -# Initialize a `DecisionTreeClassifier` wrapper object. 
-# -# :param model: The original sklearn decision tree model object -# """ -# super().__init__(model) -# self._art_model = ScikitlearnDecisionTreeClassifier(model) -# -# def get_decision_path(self, x: np.ndarray) -> np.ndarray: -# """ -# Returns the nodes along the path taken in the tree when classifying x. Last node is the leaf, first node is the -# root node. -# -# :param x: Input samples. -# :type x: `np.ndarray` or `pandas.DataFrame` -# :return: The indices of the nodes in the array structure of the tree. -# """ -# return self._art_model.get_decision_path(x) -# -# def get_samples_at_node(self, node_id: int) -> int: -# """ -# Returns the number of training samples mapped to a node. -# -# :param node_id: The ID of the node. -# :return: Number of samples mapped this node. -# """ -# return self._art_model.get_samples_at_node(node_id) diff --git a/requirements.txt b/requirements.txt index fa4131d..ec37771 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ numpy==1.21.0 pandas==1.1.0 scipy==1.4.1 scikit-learn==0.22.2 +adversarial-robustness-toolbox>=1.9.1 # testing pytest==5.4.2