diff --git a/apt/utils/__init__.py b/apt/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/apt/utils.py b/apt/utils/dataset_utils.py
similarity index 94%
rename from apt/utils.py
rename to apt/utils/dataset_utils.py
index 005c45b..f99c6cc 100644
--- a/apt/utils.py
+++ b/apt/utils/dataset_utils.py
@@ -13,8 +13,7 @@ def _load_iris(test_set_size: float = 0.3):
 
     # Split training and test sets
     x_train, x_test, y_train, y_test = model_selection.train_test_split(data, labels, test_size=test_set_size,
-                                                                        random_state=18, stratify=labels,
-                                                                        shuffle=True)
+                                                                        random_state=18, stratify=labels)
 
     return (x_train, y_train), (x_test, y_test)
 
@@ -29,6 +28,34 @@ def get_iris_dataset():
     return _load_iris()
 
 
+def _load_diabetes(test_set_size: float = 0.3):
+    """
+    Loads the diabetes dataset from scikit-learn and splits it into training and test sets.
+
+    :param test_set_size: Proportion of the data to use as the test split (value between 0 and 1).
+    :return: Training and test splits as `(x_train, y_train), (x_test, y_test)`.
+    """
+    diabetes = datasets.load_diabetes()
+    data = diabetes.data
+    labels = diabetes.target
+
+    # Split training and test sets (fixed seed keeps the split reproducible; unlike iris, no `stratify`)
+    x_train, x_test, y_train, y_test = model_selection.train_test_split(data, labels, test_size=test_set_size,
+                                                                        random_state=18)
+
+    return (x_train, y_train), (x_test, y_test)
+
+
+def get_diabetes_dataset(test_set: float = 0.3):
+    """
+    Loads the diabetes dataset from scikit-learn.
+
+    :param test_set: Proportion of the data to use as validation split (value between 0 and 1).
+    :return: Training and test splits as `(x_train, y_train), (x_test, y_test)`.
+    """
+    return _load_diabetes(test_set)
+
+
 def get_german_credit_dataset(test_set: float = 0.3):
     """
     Loads the UCI German_credit dataset from `tests/datasets/german` or downloads it if necessary.
diff --git a/apt/utils/models/__init__.py b/apt/utils/models/__init__.py
new file mode 100644
index 0000000..9f48d82
--- /dev/null
+++ b/apt/utils/models/__init__.py
@@ -0,0 +1,2 @@
+from apt.utils.models.model import Model, ModelWithLoss, SingleOutputModel, MultipleOutputModel
+from apt.utils.models.sklearn_model import SklearnModel, SklearnClassifier, SklearnRegressor
diff --git a/apt/utils/models/model.py b/apt/utils/models/model.py
new file mode 100644
index 0000000..f1fdce6
--- /dev/null
+++ b/apt/utils/models/model.py
@@ -0,0 +1,112 @@
+from abc import ABC, abstractmethod
+from typing import Union, List, Any, Optional
+import numpy as np
+
+class Model(ABC):
+    """
+    Abstract base class for ML model wrappers. Subclasses must implement `fit` and `predict`.
+    """
+
+    def __init__(self, model: Any, **kwargs) -> None:
+        """
+        Initialize a `Model` wrapper object. Extra keyword arguments are accepted but not used here.
+
+        :param model: The original model object (of the underlying ML framework)
+        """
+        self._model = model
+
+    @abstractmethod
+    def fit(self, x: np.ndarray, y: np.ndarray, **kwargs) -> None:
+        """
+        Fit the model using the training data `(x, y)`.
+
+        :param x: Training data.
+        :type x: `np.ndarray` or `pandas.DataFrame`
+        :param y: True labels.
+        :type y: `np.ndarray` or `pandas.DataFrame`
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def predict(self, x: np.ndarray, **kwargs) -> np.ndarray:
+        """
+        Perform predictions using the model for input `x`.
+
+        :param x: Input samples.
+        :type x: `np.ndarray` or `pandas.DataFrame`
+        :return: Predictions from the model.
+        """
+        raise NotImplementedError
+
+    @property
+    def model(self) -> Any:
+        """
+        Return the wrapped model object.
+
+        :return: The original model object that was passed to the constructor.
+        """
+        return self._model
+
+
+class SingleOutputModel(Model):
+    """
+    Wrapper class for ML models whose output is a single value (e.g., classification with label only output, regression).
+    """
+
+
+class MultipleOutputModel(Model):
+    """
+    Wrapper class for ML models whose output is a vector (e.g., class probabilities or logits).
+    """
+
+
+class ModelWithLoss(Model):
+    """
+    Wrapper class for ML models that support computing loss values for predictions.
+    """
+
+    def __init__(self, model: Any, loss: Optional[Any] = None, **kwargs) -> None:
+        """
+        Initialize a `ModelWithLoss` wrapper object.
+
+        :param model: The original model object (of the underlying ML framework)
+        :param loss: The loss function/object of the model (of the underlying ML framework); stored as given
+        """
+        super().__init__(model, **kwargs)
+        self._loss = loss
+
+
+    # Probably not needed for now, as we will not be using these wrappers directly in ART.
+    # @abstractmethod
+    # def loss(self, x: np.ndarray, y: np.ndarray, **kwargs) -> np.ndarray:
+    #     """
+    #     Compute the loss of the model for samples `x`.
+    #
+    #     :param x: Input samples.
+    #     :type x: `np.ndarray` or `pandas.DataFrame`
+    #     :param y: True labels.
+    #     :type y: `np.ndarray` or `pandas.DataFrame`
+    #     :return: Loss values.
+    #     """
+    #     raise NotImplementedError
+
+
+# Probably not needed for now, as we will not be using these wrappers directly in ART.
+# class ModelWithGradients(Model):
+#     """
+#     Wrapper class for ML models that support computing gradients.
+#     """
+#     @abstractmethod
+#     def class_gradient(self, x: np.ndarray, label: Union[int, List[int], None] = None, **kwargs) -> np.ndarray:
+#         """
+#         Compute per-class derivatives w.r.t. input `x`.
+#
+#         :param x: Input samples.
+#         :type x: `np.ndarray` or `pandas.DataFrame`
+#         :param label: Index of a specific class. If provided, the gradient of the specified class
+#                       is computed for all samples. Otherwise, gradients for all classes are computed for all samples.
+#         :type label: int
+#         :return: Gradients of input features w.r.t. each class in the form `(batch_size, nb_classes, input_shape)` when
+#                  computing for all classes, or `(batch_size, 1, input_shape)` when `label` is specified.
+#         """
+#         raise NotImplementedError
diff --git a/apt/utils/models/sklearn_model.py b/apt/utils/models/sklearn_model.py
new file mode 100644
index 0000000..dffa356
--- /dev/null
+++ b/apt/utils/models/sklearn_model.py
@@ -0,0 +1,142 @@
+import numpy as np
+from sklearn.preprocessing import OneHotEncoder
+
+from apt.utils.models.model import Model, ModelWithLoss, SingleOutputModel
+
+from art.estimators.classification.scikitlearn import SklearnClassifier as ArtSklearnClassifier
+from art.estimators.regression.scikitlearn import ScikitlearnRegressor
+
+
+class SklearnModel(Model):
+    """
+    Base wrapper class for scikitlearn models; adds `score` on top of the `Model` interface.
+    """
+    def score(self, x: np.ndarray, y: np.ndarray, **kwargs):
+        """
+        Score the model using test data `(x, y)` by delegating to the wrapped model's own `score` method.
+
+        :param x: Test data.
+        :type x: `np.ndarray` or `pandas.DataFrame`
+        :param y: True labels.
+        :type y: `np.ndarray` or `pandas.DataFrame`
+        """
+        return self.model.score(x, y, **kwargs)
+
+
+class SklearnClassifier(SklearnModel):
+    """
+    Wrapper class for scikitlearn classification models; fitting and prediction go through an ART classifier.
+    """
+    def __init__(self, model, **kwargs):
+        """
+        Initialize a `SklearnClassifier` wrapper object.
+
+        :param model: The original sklearn model object
+        """
+        super().__init__(model, **kwargs)
+        self._art_model = ArtSklearnClassifier(model)
+
+    def fit(self, x: np.ndarray, y: np.ndarray, **kwargs) -> None:
+        """
+        Fit the model using the training data `(x, y)`; labels are one-hot encoded before being passed to ART.
+
+        :param x: Training data.
+        :type x: `np.ndarray` or `pandas.DataFrame`
+        :param y: True labels.
+        :type y: `np.ndarray` or `pandas.DataFrame`
+        """
+        # `.toarray()` on the default (sparse) output avoids the version-dependent `sparse`/`sparse_output` keyword.
+        y_encoded = OneHotEncoder().fit_transform(np.asarray(y).reshape(-1, 1)).toarray()
+        self._art_model.fit(x, y_encoded, **kwargs)
+
+    def predict(self, x: np.ndarray, **kwargs) -> np.ndarray:
+        """
+        Perform predictions using the ART-wrapped model for input `x`.
+
+        :param x: Input samples.
+        :type x: `np.ndarray` or `pandas.DataFrame`
+        :return: Predictions from the model.
+        """
+        return self._art_model.predict(x, **kwargs)
+
+
+class SklearnRegressor(SklearnModel, SingleOutputModel, ModelWithLoss):
+    """
+    Wrapper class for scikitlearn regression models; fitting, prediction and loss go through an ART regressor.
+    """
+    def __init__(self, model, **kwargs):
+        """
+        Initialize a `SklearnRegressor` wrapper object.
+
+        :param model: The original sklearn model object
+        """
+        super().__init__(model, **kwargs)
+        self._art_model = ScikitlearnRegressor(model)
+
+    def fit(self, x: np.ndarray, y: np.ndarray, **kwargs) -> None:
+        """
+        Fit the model using the training data `(x, y)`.
+
+        :param x: Training data.
+        :type x: `np.ndarray` or `pandas.DataFrame`
+        :param y: True labels.
+        :type y: `np.ndarray` or `pandas.DataFrame`
+        """
+        self._art_model.fit(x, y, **kwargs)
+
+    def predict(self, x: np.ndarray, **kwargs) -> np.ndarray:
+        """
+        Perform predictions using the ART-wrapped model for input `x`.
+
+        :param x: Input samples.
+        :type x: `np.ndarray` or `pandas.DataFrame`
+        :return: Predictions from the model.
+        """
+        return self._art_model.predict(x, **kwargs)
+
+    def loss(self, x: np.ndarray, y: np.ndarray, **kwargs) -> np.ndarray:
+        """
+        Compute the loss of the model for samples `x` via the ART regressor's `compute_loss`.
+
+        :param x: Input samples.
+        :type x: `np.ndarray` or `pandas.DataFrame`
+        :param y: True labels.
+        :type y: `np.ndarray` or `pandas.DataFrame`
+        :return: Loss values.
+        """
+        return self._art_model.compute_loss(x, y, **kwargs)
+
+
+# Probably not needed for now, as we will not be using these wrappers directly in ART.
+# class SklearnDecisionTreeClassifier(SklearnClassifier, MultipleOutputModel):
+#     """
+#     Wrapper class for scikitlearn decision tree classifier models.
+#     """
+#     def __init__(self, model):
+#         """
+#         Initialize a `DecisionTreeClassifier` wrapper object.
+# +# :param model: The original sklearn decision tree model object +# """ +# super().__init__(model) +# self._art_model = ScikitlearnDecisionTreeClassifier(model) +# +# def get_decision_path(self, x: np.ndarray) -> np.ndarray: +# """ +# Returns the nodes along the path taken in the tree when classifying x. Last node is the leaf, first node is the +# root node. +# +# :param x: Input samples. +# :type x: `np.ndarray` or `pandas.DataFrame` +# :return: The indices of the nodes in the array structure of the tree. +# """ +# return self._art_model.get_decision_path(x) +# +# def get_samples_at_node(self, node_id: int) -> int: +# """ +# Returns the number of training samples mapped to a node. +# +# :param node_id: The ID of the node. +# :return: Number of samples mapped this node. +# """ +# return self._art_model.get_samples_at_node(node_id) diff --git a/tests/test_anonymizer.py b/tests/test_anonymizer.py index 466c129..4a96b9a 100644 --- a/tests/test_anonymizer.py +++ b/tests/test_anonymizer.py @@ -4,7 +4,7 @@ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.preprocessing import OneHotEncoder from apt.anonymization import Anonymize -from apt.utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset +from apt.utils.dataset_utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset from sklearn.datasets import load_diabetes from sklearn.model_selection import train_test_split diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py index 3ed7fa6..f39cf10 100644 --- a/tests/test_minimizer.py +++ b/tests/test_minimizer.py @@ -12,7 +12,7 @@ from sklearn.preprocessing import OneHotEncoder, StandardScaler from apt.minimization import GeneralizeToRepresentative from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor -from apt.utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset, get_german_credit_dataset +from apt.utils.dataset_utils import get_iris_dataset, get_adult_dataset, 
get_nursery_dataset, get_german_credit_dataset
 
 
 @pytest.fixture
diff --git a/tests/test_model.py b/tests/test_model.py
new file mode 100644
index 0000000..d1dc6eb
--- /dev/null
+++ b/tests/test_model.py
@@ -0,0 +1,50 @@
+import pytest
+
+from apt.utils.models import SklearnClassifier, SklearnRegressor
+from apt.utils import dataset_utils
+
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.ensemble import RandomForestClassifier
+
+
+def test_sklearn_classifier():
+    """Fit the classifier wrapper on iris, then check the prediction count and the score range."""
+    (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset()
+    underlying_model = RandomForestClassifier()
+    model = SklearnClassifier(underlying_model)
+    model.fit(x_train, y_train)
+    pred = model.predict(x_test)
+    assert pred.shape[0] == x_test.shape[0]
+
+    score = model.score(x_test, y_test)
+    assert 0.0 <= score <= 1.0
+
+
+def test_sklearn_regressor():
+    """Fit the regressor wrapper on diabetes, then check the prediction count, score and loss count."""
+    (x_train, y_train), (x_test, y_test) = dataset_utils.get_diabetes_dataset()
+    underlying_model = DecisionTreeRegressor()
+    model = SklearnRegressor(underlying_model)
+    model.fit(x_train, y_train)
+    pred = model.predict(x_test)
+    assert pred.shape[0] == x_test.shape[0]
+
+    # scikit-learn regressors report R^2, which can be negative on held-out data, so only the upper bound is checked.
+    score = model.score(x_test, y_test)
+    assert score <= 1.0
+
+    losses = model.loss(x_test, y_test)
+    assert losses.shape[0] == x_test.shape[0]
+
+
+# Probably not needed for now, as we will not be using these wrappers directly in ART.
+# def test_sklearn_decision_tree():
+#     (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset()
+#     underlying_model = DecisionTreeClassifier()
+#     model = SklearnDecisionTreeClassifier(underlying_model)
+#     model.fit(x_train, y_train)
+#     pred = model.predict(x_test)
+#     assert(pred.shape[0] == x_test.shape[0])
+#
+#     score = model.score(x_test, y_test)
+#     assert(0.0 <= score <= 1.0)