This commit is contained in:
abigailt 2022-03-07 19:09:31 +02:00
parent 3d82db80c4
commit f2df2fcc8c
6 changed files with 35 additions and 43 deletions

View file

@ -3,4 +3,4 @@ The AI Privacy Toolbox (datasets).
Implementation of datasets utility components for datasets creation, load, and store Implementation of datasets utility components for datasets creation, load, and store
""" """
from apt.utils.datasets.datasets import DatasetABC, StoredDatasetABC, DatasetFactory, Data, BaseDataset, DATA_ARRAY_TYPE from apt.utils.datasets.datasets import Dataset, StoredDataset, DatasetFactory, Data, ArrayDataset, DATA_ARRAY_TYPE

View file

@ -20,7 +20,7 @@ logger = logging.getLogger(__name__)
DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame] DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame]
class DatasetABC(metaclass=ABCMeta): class Dataset(metaclass=ABCMeta):
"""Base Abstract Class for Dataset""" """Base Abstract Class for Dataset"""
@abstractmethod @abstractmethod
@ -38,7 +38,7 @@ class DatasetABC(metaclass=ABCMeta):
pass pass
class StoredDatasetABC(DatasetABC): class StoredDataset(Dataset):
"""Abstract Class for Storable Dataset""" """Abstract Class for Storable Dataset"""
@abstractmethod @abstractmethod
@ -73,7 +73,7 @@ class StoredDatasetABC(DatasetABC):
logger.info('Dataset Downloaded') logger.info('Dataset Downloaded')
if unzip: if unzip:
StoredDatasetABC.extract_archive(zip_path=file_path, dest_path=dest_path, remove_archive=False) StoredDataset.extract_archive(zip_path=file_path, dest_path=dest_path, remove_archive=False)
@staticmethod @staticmethod
@ -123,12 +123,12 @@ class StoredDatasetABC(DatasetABC):
np.savetxt(dest_datafile, debug_data, delimiter=delimiter, fmt=fmt) np.savetxt(dest_datafile, debug_data, delimiter=delimiter, fmt=fmt)
class BaseDataset(DatasetABC): class ArrayDataset(Dataset):
"""Base Class for Dataset""" """Dataset that is based on x and y arrays (e.g., numpy/pandas)"""
def __init__(self, x: DATA_ARRAY_TYPE, y: DATA_ARRAY_TYPE, **kwargs): def __init__(self, x: DATA_ARRAY_TYPE, y: DATA_ARRAY_TYPE, **kwargs):
""" """
BaseDataset constructor. ArrayDataset constructor.
:param x: collection of data samples :param x: collection of data samples
:param y: collection of labels :param y: collection of labels
:param kwargs: dataset parameters :param kwargs: dataset parameters
@ -159,7 +159,7 @@ class DatasetFactory:
:param name: dataset name :param name: dataset name
:return: :return:
""" """
def inner_wrapper(wrapped_class: DatasetABC) -> Any: def inner_wrapper(wrapped_class: Dataset) -> Any:
if name in cls.registry: if name in cls.registry:
logger.warning('Dataset %s already exists. Will replace it', name) logger.warning('Dataset %s already exists. Will replace it', name)
cls.registry[name] = wrapped_class cls.registry[name] = wrapped_class
@ -168,7 +168,7 @@ class DatasetFactory:
return inner_wrapper return inner_wrapper
@classmethod @classmethod
def create_dataset(cls, name: str, **kwargs) -> DatasetABC: def create_dataset(cls, name: str, **kwargs) -> Dataset:
""" """
Factory command to create dataset instance. Factory command to create dataset instance.
This method gets the appropriate Dataset class from the registry This method gets the appropriate Dataset class from the registry
@ -190,7 +190,7 @@ class DatasetFactory:
class Data: class Data:
def __init__(self, train: DatasetABC = None, test: DatasetABC = None, **kwargs): def __init__(self, train: Dataset = None, test: Dataset = None, **kwargs):
""" """
Data class constructor. Data class constructor.
The class stores train and test datasets. The class stores train and test datasets.
@ -205,11 +205,11 @@ class Data:
self.train = DatasetFactory.create_dataset(train=True, **kwargs) self.train = DatasetFactory.create_dataset(train=True, **kwargs)
self.test = DatasetFactory.create_dataset(train=False, **kwargs) self.test = DatasetFactory.create_dataset(train=False, **kwargs)
def get_train_set(self) -> DatasetABC: def get_train_set(self) -> Dataset:
"""Return train DatasetBase""" """Return train DatasetBase"""
return self.train return self.train
def get_test_set(self) -> DatasetABC: def get_test_set(self) -> Dataset:
"""Return test DatasetBase""" """Return test DatasetBase"""
return self.test return self.test

View file

@ -1,2 +1,2 @@
from apt.utils.models.model import Model, ModelWithLoss, SingleOutputModel, MultipleOutputModel from apt.utils.models.model import Model
from apt.utils.models.sklearn_model import SklearnModel, SklearnClassifier, SklearnRegressor from apt.utils.models.sklearn_model import SklearnModel, SklearnClassifier, SklearnRegressor

View file

@ -1,7 +1,7 @@
from abc import ABCMeta, abstractmethod from abc import ABCMeta, abstractmethod
from typing import Any from typing import Any
from apt.utils.datasets import BaseDataset, DATA_ARRAY_TYPE from apt.utils.datasets import Dataset, DATA_ARRAY_TYPE
class Model(metaclass=ABCMeta): class Model(metaclass=ABCMeta):
@ -18,12 +18,12 @@ class Model(metaclass=ABCMeta):
self._model = model self._model = model
@abstractmethod @abstractmethod
def fit(self, train_data: BaseDataset, **kwargs) -> None: def fit(self, train_data: Dataset, **kwargs) -> None:
""" """
Fit the model using the training data. Fit the model using the training data.
:param train_data: Training data. :param train_data: Training data.
:type train_data: `BaseDataset` :type train_data: `Dataset`
""" """
raise NotImplementedError raise NotImplementedError

View file

@ -4,7 +4,7 @@ from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator from sklearn.base import BaseEstimator
from apt.utils.models import Model from apt.utils.models import Model
from apt.utils.datasets import BaseDataset, DATA_ARRAY_TYPE from apt.utils.datasets import Dataset, DATA_ARRAY_TYPE
from art.estimators.classification.scikitlearn import SklearnClassifier as ArtSklearnClassifier from art.estimators.classification.scikitlearn import SklearnClassifier as ArtSklearnClassifier
from art.estimators.regression.scikitlearn import ScikitlearnRegressor from art.estimators.regression.scikitlearn import ScikitlearnRegressor
@ -14,12 +14,12 @@ class SklearnModel(Model):
""" """
Wrapper class for scikitlearn models. Wrapper class for scikitlearn models.
""" """
def score(self, test_data: BaseDataset, **kwargs): def score(self, test_data: Dataset, **kwargs):
""" """
Score the model using test data. Score the model using test data.
:param test_data: Test data. :param test_data: Test data.
:type train_data: `BaseDataset` :type train_data: `Dataset`
""" """
return self.model.score(test_data.get_samples(), test_data.get_labels(), **kwargs) return self.model.score(test_data.get_samples(), test_data.get_labels(), **kwargs)
@ -37,12 +37,12 @@ class SklearnClassifier(SklearnModel):
super().__init__(model, **kwargs) super().__init__(model, **kwargs)
self._art_model = ArtSklearnClassifier(model) self._art_model = ArtSklearnClassifier(model)
def fit(self, train_data: BaseDataset, **kwargs) -> None: def fit(self, train_data: Dataset, **kwargs) -> None:
""" """
Fit the model using the training data. Fit the model using the training data.
:param train_data: Training data. :param train_data: Training data.
:type train_data: `BaseDataset` :type train_data: `Dataset`
""" """
encoder = OneHotEncoder(sparse=False) encoder = OneHotEncoder(sparse=False)
y_encoded = encoder.fit_transform(train_data.get_labels().reshape(-1, 1)) y_encoded = encoder.fit_transform(train_data.get_labels().reshape(-1, 1))
@ -72,12 +72,12 @@ class SklearnRegressor(SklearnModel):
super().__init__(model, **kwargs) super().__init__(model, **kwargs)
self._art_model = ScikitlearnRegressor(model) self._art_model = ScikitlearnRegressor(model)
def fit(self, train_data: BaseDataset, **kwargs) -> None: def fit(self, train_data: Dataset, **kwargs) -> None:
""" """
Fit the model using the training data. Fit the model using the training data.
:param train_data: Training data. :param train_data: Training data.
:type train_data: `BaseDataset` :type train_data: `Dataset`
""" """
self._art_model.fit(train_data.get_samples(), train_data.get_labels(), **kwargs) self._art_model.fit(train_data.get_samples(), train_data.get_labels(), **kwargs)

View file

@ -1,44 +1,36 @@
import pytest import pytest
from apt.utils.models import SklearnClassifier, SklearnRegressor from apt.utils.models import SklearnClassifier, SklearnRegressor
from apt.utils.datasets import ArrayDataset
from apt.utils import dataset_utils from apt.utils import dataset_utils
from sklearn.tree import DecisionTreeRegressor from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import RandomForestClassifier
def test_sklearn_classifier(): def test_sklearn_classifier():
(x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset() (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset()
underlying_model = RandomForestClassifier() underlying_model = RandomForestClassifier()
model = SklearnClassifier(underlying_model) model = SklearnClassifier(underlying_model)
model.fit(x_train, y_train) train = ArrayDataset(x_train, y_train)
test = ArrayDataset(x_test, y_test)
model.fit(train)
pred = model.predict(x_test) pred = model.predict(x_test)
assert(pred.shape[0] == x_test.shape[0]) assert(pred.shape[0] == x_test.shape[0])
score = model.score(x_test, y_test) score = model.score(test)
assert(0.0 <= score <= 1.0) assert(0.0 <= score <= 1.0)
def test_sklearn_regressor(): def test_sklearn_regressor():
(x_train, y_train), (x_test, y_test) = dataset_utils.get_diabetes_dataset() (x_train, y_train), (x_test, y_test) = dataset_utils.get_diabetes_dataset()
underlying_model = DecisionTreeRegressor() underlying_model = DecisionTreeRegressor()
model = SklearnRegressor(underlying_model) model = SklearnRegressor(underlying_model)
model.fit(x_train, y_train) train = ArrayDataset(x_train, y_train)
test = ArrayDataset(x_test, y_test)
model.fit(train)
pred = model.predict(x_test) pred = model.predict(x_test)
assert (pred.shape[0] == x_test.shape[0]) assert (pred.shape[0] == x_test.shape[0])
score = model.score(x_test, y_test) score = model.score(test)
assert (0 <= score <= 1)
losses = model.loss(x_test, y_test)
assert (losses.shape[0] == x_test.shape[0])
# Probably not needed for now, as we will not be using these wrappers directly in ART.
# def test_sklearn_decision_tree():
# (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset()
# underlying_model = DecisionTreeClassifier()
# model = SklearnDecisionTreeClassifier(underlying_model)
# model.fit(x_train, y_train)
# pred = model.predict(x_test)
# assert(pred.shape[0] == x_test.shape[0])
#
# score = model.score(x_test, y_test)
# assert(0.0 <= score <= 1.0)