This commit is contained in:
abigailt 2022-03-07 19:09:31 +02:00
parent 3d82db80c4
commit f2df2fcc8c
6 changed files with 35 additions and 43 deletions

View file

@@ -3,4 +3,4 @@ The AI Privacy Toolbox (datasets).
Implementation of datasets utility components for datasets creation, load, and store
"""
from apt.utils.datasets.datasets import DatasetABC, StoredDatasetABC, DatasetFactory, Data, BaseDataset, DATA_ARRAY_TYPE
from apt.utils.datasets.datasets import Dataset, StoredDataset, DatasetFactory, Data, ArrayDataset, DATA_ARRAY_TYPE

View file

@@ -20,7 +20,7 @@ logger = logging.getLogger(__name__)
DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame]
class DatasetABC(metaclass=ABCMeta):
class Dataset(metaclass=ABCMeta):
"""Base Abstract Class for Dataset"""
@abstractmethod
@@ -38,7 +38,7 @@ class DatasetABC(metaclass=ABCMeta):
pass
class StoredDatasetABC(DatasetABC):
class StoredDataset(Dataset):
"""Abstract Class for Storable Dataset"""
@abstractmethod
@@ -73,7 +73,7 @@ class StoredDatasetABC(DatasetABC):
logger.info('Dataset Downloaded')
if unzip:
StoredDatasetABC.extract_archive(zip_path=file_path, dest_path=dest_path, remove_archive=False)
StoredDataset.extract_archive(zip_path=file_path, dest_path=dest_path, remove_archive=False)
@staticmethod
@@ -123,12 +123,12 @@ class StoredDatasetABC(DatasetABC):
np.savetxt(dest_datafile, debug_data, delimiter=delimiter, fmt=fmt)
class BaseDataset(DatasetABC):
"""Base Class for Dataset"""
class ArrayDataset(Dataset):
"""Dataset that is based on x and y arrays (e.g., numpy/pandas)"""
def __init__(self, x: DATA_ARRAY_TYPE, y: DATA_ARRAY_TYPE, **kwargs):
"""
BaseDataset constructor.
ArrayDataset constructor.
:param x: collection of data samples
:param y: collection of labels
:param kwargs: dataset parameters
@@ -159,7 +159,7 @@ class DatasetFactory:
:param name: dataset name
:return:
"""
def inner_wrapper(wrapped_class: DatasetABC) -> Any:
def inner_wrapper(wrapped_class: Dataset) -> Any:
if name in cls.registry:
logger.warning('Dataset %s already exists. Will replace it', name)
cls.registry[name] = wrapped_class
@@ -168,7 +168,7 @@ class DatasetFactory:
return inner_wrapper
@classmethod
def create_dataset(cls, name: str, **kwargs) -> DatasetABC:
def create_dataset(cls, name: str, **kwargs) -> Dataset:
"""
Factory command to create dataset instance.
This method gets the appropriate Dataset class from the registry
@@ -190,7 +190,7 @@ class DatasetFactory:
class Data:
def __init__(self, train: DatasetABC = None, test: DatasetABC = None, **kwargs):
def __init__(self, train: Dataset = None, test: Dataset = None, **kwargs):
"""
Data class constructor.
The class stores train and test datasets.
@@ -205,11 +205,11 @@ class Data:
self.train = DatasetFactory.create_dataset(train=True, **kwargs)
self.test = DatasetFactory.create_dataset(train=False, **kwargs)
def get_train_set(self) -> DatasetABC:
def get_train_set(self) -> Dataset:
"""Return train DatasetBase"""
return self.train
def get_test_set(self) -> DatasetABC:
def get_test_set(self) -> Dataset:
"""Return test DatasetBase"""
return self.test

View file

@@ -1,2 +1,2 @@
from apt.utils.models.model import Model, ModelWithLoss, SingleOutputModel, MultipleOutputModel
from apt.utils.models.model import Model
from apt.utils.models.sklearn_model import SklearnModel, SklearnClassifier, SklearnRegressor

View file

@@ -1,7 +1,7 @@
from abc import ABCMeta, abstractmethod
from typing import Any
from apt.utils.datasets import BaseDataset, DATA_ARRAY_TYPE
from apt.utils.datasets import Dataset, DATA_ARRAY_TYPE
class Model(metaclass=ABCMeta):
@@ -18,12 +18,12 @@ class Model(metaclass=ABCMeta):
self._model = model
@abstractmethod
def fit(self, train_data: BaseDataset, **kwargs) -> None:
def fit(self, train_data: Dataset, **kwargs) -> None:
"""
Fit the model using the training data.
:param train_data: Training data.
:type train_data: `BaseDataset`
:type train_data: `Dataset`
"""
raise NotImplementedError

View file

@@ -4,7 +4,7 @@ from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator
from apt.utils.models import Model
from apt.utils.datasets import BaseDataset, DATA_ARRAY_TYPE
from apt.utils.datasets import Dataset, DATA_ARRAY_TYPE
from art.estimators.classification.scikitlearn import SklearnClassifier as ArtSklearnClassifier
from art.estimators.regression.scikitlearn import ScikitlearnRegressor
@@ -14,12 +14,12 @@ class SklearnModel(Model):
"""
Wrapper class for scikitlearn models.
"""
def score(self, test_data: BaseDataset, **kwargs):
def score(self, test_data: Dataset, **kwargs):
"""
Score the model using test data.
:param test_data: Test data.
:type train_data: `BaseDataset`
:type train_data: `Dataset`
"""
return self.model.score(test_data.get_samples(), test_data.get_labels(), **kwargs)
@@ -37,12 +37,12 @@ class SklearnClassifier(SklearnModel):
super().__init__(model, **kwargs)
self._art_model = ArtSklearnClassifier(model)
def fit(self, train_data: BaseDataset, **kwargs) -> None:
def fit(self, train_data: Dataset, **kwargs) -> None:
"""
Fit the model using the training data.
:param train_data: Training data.
:type train_data: `BaseDataset`
:type train_data: `Dataset`
"""
encoder = OneHotEncoder(sparse=False)
y_encoded = encoder.fit_transform(train_data.get_labels().reshape(-1, 1))
@@ -72,12 +72,12 @@ class SklearnRegressor(SklearnModel):
super().__init__(model, **kwargs)
self._art_model = ScikitlearnRegressor(model)
def fit(self, train_data: BaseDataset, **kwargs) -> None:
def fit(self, train_data: Dataset, **kwargs) -> None:
"""
Fit the model using the training data.
:param train_data: Training data.
:type train_data: `BaseDataset`
:type train_data: `Dataset`
"""
self._art_model.fit(train_data.get_samples(), train_data.get_labels(), **kwargs)

View file

@@ -1,44 +1,36 @@
import pytest
from apt.utils.models import SklearnClassifier, SklearnRegressor
from apt.utils.datasets import ArrayDataset
from apt.utils import dataset_utils
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
def test_sklearn_classifier():
(x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset()
underlying_model = RandomForestClassifier()
model = SklearnClassifier(underlying_model)
model.fit(x_train, y_train)
train = ArrayDataset(x_train, y_train)
test = ArrayDataset(x_test, y_test)
model.fit(train)
pred = model.predict(x_test)
assert(pred.shape[0] == x_test.shape[0])
score = model.score(x_test, y_test)
score = model.score(test)
assert(0.0 <= score <= 1.0)
def test_sklearn_regressor():
(x_train, y_train), (x_test, y_test) = dataset_utils.get_diabetes_dataset()
underlying_model = DecisionTreeRegressor()
model = SklearnRegressor(underlying_model)
model.fit(x_train, y_train)
train = ArrayDataset(x_train, y_train)
test = ArrayDataset(x_test, y_test)
model.fit(train)
pred = model.predict(x_test)
assert (pred.shape[0] == x_test.shape[0])
score = model.score(x_test, y_test)
losses = model.loss(x_test, y_test)
assert (losses.shape[0] == x_test.shape[0])
# Probably not needed for now, as we will not be using these wrappers directly in ART.
# def test_sklearn_decision_tree():
# (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset()
# underlying_model = DecisionTreeClassifier()
# model = SklearnDecisionTreeClassifier(underlying_model)
# model.fit(x_train, y_train)
# pred = model.predict(x_test)
# assert(pred.shape[0] == x_test.shape[0])
#
# score = model.score(x_test, y_test)
# assert(0.0 <= score <= 1.0)
score = model.score(test)
assert (0 <= score <= 1)