diff --git a/apt/utils/datasets/__init__.py b/apt/utils/datasets/__init__.py index 09f38a4..6e7c640 100644 --- a/apt/utils/datasets/__init__.py +++ b/apt/utils/datasets/__init__.py @@ -3,4 +3,5 @@ The AI Privacy Toolbox (datasets). Implementation of datasets utility components for datasets creation, load, and store """ -from apt.utils.datasets.datasets import Dataset, StoredDataset, DatasetFactory, Data, ArrayDataset, DATA_ARRAY_TYPE +from apt.utils.datasets.datasets import Dataset, StoredDataset, DatasetFactory, Data, ArrayDataset, \ + OUTPUT_DATA_ARRAY_TYPE, DATA_PANDAS_NUMPY_TYPE diff --git a/apt/utils/datasets/datasets.py b/apt/utils/datasets/datasets.py index a164ba4..aff591f 100644 --- a/apt/utils/datasets/datasets.py +++ b/apt/utils/datasets/datasets.py @@ -5,7 +5,7 @@ Implementation of utility classes for dataset handling """ from abc import ABCMeta, abstractmethod -from typing import Callable, Collection, Any, Union +from typing import Callable, Collection, Any, Union, List, Optional import tarfile import os @@ -13,11 +13,14 @@ import urllib.request import numpy as np import pandas as pd import logging +from torch import Tensor logger = logging.getLogger(__name__) -DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame] +INPUT_DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame, List, Tensor] +OUTPUT_DATA_ARRAY_TYPE = np.ndarray +DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame] class Dataset(metaclass=ABCMeta): @@ -124,28 +127,50 @@ class StoredDataset(Dataset): class ArrayDataset(Dataset): - """Dataset that is based on x and y arrays (e.g., numpy/pandas)""" + """Dataset that is based on x and y arrays (e.g., numpy/pandas/list...)""" - def __init__(self, x: DATA_ARRAY_TYPE, y: DATA_ARRAY_TYPE, **kwargs): + def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None, **kwargs): """ ArrayDataset constructor. :param x: collection of data samples - :param y: collection of labels + :param y: collection of labels (optional) :param kwargs: dataset parameters """ - self.x = x - self.y = y + # convert to numpy + if type(x) == np.ndarray: + self._x = x + elif type(x) == pd.DataFrame: + self._x = x.to_numpy() + elif isinstance(x, list): + self._x = np.array(x) + elif type(x) == Tensor: + self._x = x.numpy() + else: + raise ValueError('Non supported type for x: ', type(x).__name__) - if len(self.x) != len(self.y): + self._y = None + if y is not None: + if type(y) == np.ndarray: + self._y = y + elif type(y) == pd.DataFrame: + self._y = y.to_numpy() + elif isinstance(y, list): + self._y = np.array(y) + elif type(y) == Tensor: + self._y = y.numpy() + else: + raise ValueError('Non supported type for y: ', type(y).__name__) + + if y is not None and len(self._x) != len(self._y): raise ValueError('Non equivalent lengths of x and y') - def get_samples(self) -> DATA_ARRAY_TYPE: - """Return data samples""" - return self.x + def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE: + """Return data samples as numpy array""" + return self._x - def get_labels(self) -> DATA_ARRAY_TYPE: - """Return labels""" - return self.y + def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE: + """Return labels as numpy array""" + return self._y class DatasetFactory: @@ -189,7 +214,6 @@ class DatasetFactory: class Data: - def __init__(self, train: Dataset = None, test: Dataset = None, **kwargs): """ Data class constructor. diff --git a/apt/utils/models/__init__.py b/apt/utils/models/__init__.py index dc50790..11efd5f 100644 --- a/apt/utils/models/__init__.py +++ b/apt/utils/models/__init__.py @@ -1,2 +1,2 @@ -from apt.utils.models.model import Model +from apt.utils.models.model import Model, ModelOutputType from apt.utils.models.sklearn_model import SklearnModel, SklearnClassifier, SklearnRegressor diff --git a/apt/utils/models/model.py b/apt/utils/models/model.py index 1ef13ad..829725b 100644 --- a/apt/utils/models/model.py +++ b/apt/utils/models/model.py @@ -1,7 +1,14 @@ from abc import ABCMeta, abstractmethod from typing import Any +from enum import Enum, auto -from apt.utils.datasets import Dataset, DATA_ARRAY_TYPE +from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE + + +class ModelOutputType(Enum): + CLASSIFIER_VECTOR = auto() # probabilities or logits + CLASSIFIER_SCALAR = auto() # label only + REGRESSOR_SCALAR = auto() # value class Model(metaclass=ABCMeta): @@ -9,13 +16,16 @@ class Model(metaclass=ABCMeta): Abstract base class for ML model wrappers. """ - def __init__(self, model: Any, **kwargs): + def __init__(self, model: Any, output_type: ModelOutputType, **kwargs): """ Initialize a `Model` wrapper object. :param model: The original model object (of the underlying ML framework) + :param output_type: The type of output the model yields (vector/label only for classifiers, + value for regressors) """ self._model = model + self._output_type = output_type @abstractmethod def fit(self, train_data: Dataset, **kwargs) -> None: @@ -28,7 +38,7 @@ class Model(metaclass=ABCMeta): raise NotImplementedError @abstractmethod - def predict(self, x: DATA_ARRAY_TYPE, **kwargs) -> DATA_ARRAY_TYPE: + def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE: """ Perform predictions using the model for input `x`. @@ -39,10 +49,19 @@ class Model(metaclass=ABCMeta): raise NotImplementedError @property - def model(self): + def model(self) -> Any: """ Return the model. :return: The model. """ return self._model + + @property + def output_type(self) -> ModelOutputType: + """ + Return the model's output type. + + :return: The model's output type. + """ + return self._output_type diff --git a/apt/utils/models/sklearn_model.py b/apt/utils/models/sklearn_model.py index 731a554..91c6280 100644 --- a/apt/utils/models/sklearn_model.py +++ b/apt/utils/models/sklearn_model.py @@ -3,8 +3,8 @@ import numpy as np from sklearn.preprocessing import OneHotEncoder from sklearn.base import BaseEstimator -from apt.utils.models import Model -from apt.utils.datasets import Dataset, DATA_ARRAY_TYPE +from apt.utils.models import Model, ModelOutputType +from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE from art.estimators.classification.scikitlearn import SklearnClassifier as ArtSklearnClassifier from art.estimators.regression.scikitlearn import ScikitlearnRegressor @@ -28,13 +28,13 @@ class SklearnClassifier(SklearnModel): """ Wrapper class for scikitlearn classification models. """ - def __init__(self, model: BaseEstimator, **kwargs): + def __init__(self, model: BaseEstimator, output_type: ModelOutputType, **kwargs): """ Initialize a `SklearnClassifier` wrapper object. :param model: The original sklearn model object """ - super().__init__(model, **kwargs) + super().__init__(model, output_type, **kwargs) self._art_model = ArtSklearnClassifier(model) def fit(self, train_data: Dataset, **kwargs) -> None: @@ -48,7 +48,7 @@ class SklearnClassifier(SklearnModel): y_encoded = encoder.fit_transform(train_data.get_labels().reshape(-1, 1)) self._art_model.fit(train_data.get_samples(), y_encoded, **kwargs) - def predict(self, x: DATA_ARRAY_TYPE, **kwargs) -> DATA_ARRAY_TYPE: + def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE: """ Perform predictions using the model for input `x`. @@ -69,7 +69,7 @@ class SklearnRegressor(SklearnModel): :param model: The original sklearn model object """ - super().__init__(model, **kwargs) + super().__init__(model, ModelOutputType.REGRESSOR_SCALAR, **kwargs) self._art_model = ScikitlearnRegressor(model) def fit(self, train_data: Dataset, **kwargs) -> None: @@ -81,7 +81,7 @@ class SklearnRegressor(SklearnModel): """ self._art_model.fit(train_data.get_samples(), train_data.get_labels(), **kwargs) - def predict(self, x: DATA_ARRAY_TYPE, **kwargs) -> DATA_ARRAY_TYPE: + def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE: """ Perform predictions using the model for input `x`. diff --git a/tests/test_model.py b/tests/test_model.py index 7dd151c..8041f04 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1,6 +1,6 @@ import pytest -from apt.utils.models import SklearnClassifier, SklearnRegressor +from apt.utils.models import SklearnClassifier, SklearnRegressor, ModelOutputType from apt.utils.datasets import ArrayDataset from apt.utils import dataset_utils @@ -11,7 +11,7 @@ from sklearn.ensemble import RandomForestClassifier def test_sklearn_classifier(): (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset() underlying_model = RandomForestClassifier() - model = SklearnClassifier(underlying_model) + model = SklearnClassifier(underlying_model, ModelOutputType.CLASSIFIER_VECTOR) train = ArrayDataset(x_train, y_train) test = ArrayDataset(x_test, y_test) model.fit(train)