Documentation updates (#29)

* Bump version to 0.1.0 (breaking changes to some APIs)

* Update documentation

* Update requirements

* gitignore
This commit is contained in:
abigailgold 2022-05-02 11:46:18 +03:00 committed by GitHub
parent 014aed9670
commit fd6be8e778
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 640 additions and 298 deletions

160
.gitignore vendored Normal file
View file

@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

View file

@ -19,27 +19,25 @@ class Anonymize:
Based on the implementation described in: https://arxiv.org/abs/2007.13086
Parameters
----------
k : int
The privacy parameter that determines the number of records that will be indistinguishable from each
other (when looking at the quasi identifiers). Should be at least 2.
quasi_identifiers : np.ndarray or list
The features that need to be minimized in case of pandas data, and indexes of features
in case of numpy data.
categorical_features : list, optional
The list of categorical features (should only be supplied when passing data as a
pandas dataframe.
is_regression : Bool, optional
Whether the model is a regression model or not (if False, assumes
a classification model). Default is False.
train_only_QI : Bool, optional
The required method to train data set for anonymization. Default is
to train the tree on all features.
:param k: The privacy parameter that determines the number of records that will be indistinguishable from each
other (when looking at the quasi identifiers). Should be at least 2.
:type k: int
:param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
in case of numpy data.
:type quasi_identifiers: np.ndarray or list
:param categorical_features: The list of categorical features (if supplied, these features will be one-hot encoded
before using them to train the decision tree model).
:type categorical_features: list, optional
:param is_regression: Whether the model is a regression model or not (if False, assumes a classification model).
Default is False.
:type is_regression: boolean, optional
:param train_only_QI: The required method to train data set for anonymization. Default is
to train the tree on all features.
:type train_only_QI: boolean, optional
"""
def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
is_regression=False, train_only_QI=False):
is_regression: Optional[bool] = False, train_only_QI: Optional[bool] = False):
if k < 2:
raise ValueError("k should be a positive integer with a value of 2 or higher")
if quasi_identifiers is None or len(quasi_identifiers) < 1:
@ -59,7 +57,9 @@ class Anonymize:
:param dataset: Data wrapper containing the training data for the model and the predictions of the
original model on the training data.
:return: An array containing the anonymized training dataset.
:type dataset: `ArrayDataset`
:return: The anonymized training dataset as either numpy array or pandas DataFrame (depending on the type of
the original data used to create the ArrayDataset).
"""
if dataset.get_samples().shape[1] != 0:
self.features = [i for i in range(dataset.get_samples().shape[1])]

View file

@ -21,7 +21,8 @@ from apt.utils.models import Model, SklearnRegressor, ModelOutputType, SklearnCl
class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
""" A transformer that generalizes data to representative points.
"""
A transformer that generalizes data to representative points.
Learns data generalizations based on an original model's predictions
and a target accuracy. Once the generalizations are learned, can
@ -34,52 +35,33 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
need to supply an existing ``estimator`` to init.
In summary, either ``estimator`` and ``target_accuracy`` should be
supplied or ``cells`` should be supplied.
Parameters
----------
estimator : estimator, optional
The original model for which generalization is being performed.
Should be pre-fitted.
target_accuracy : float, optional
The required accuracy when applying the base model to the
generalized data. Accuracy is measured relative to the original
accuracy of the model.
categorical_features: list of str, optional
The list of categorical features should only be supplied when
passing data as a pandas dataframe.
features_to_minimize: List of str or numbers, optional
The features that need to be minimized in case of pandas data,
and indexes of features in case of numpy data.
cells : list of object, optional
The cells used to generalize records. Each cell must define a
range or subset of categories for each feature, as well as a
representative value for each feature.
This parameter should be used when instantiating a transformer
object without first fitting it.
train_only_QI : Bool, optional
The required method to train data set for minimizing. Default is
to train the tree just on the features that are given as
features_to_minimize.
is_regression : Bool, optional
Whether the model is a regression model or not (if False, assumes
a classification model). Default is False.
Attributes
----------
features_ : list of str
The feature names, in the order that they appear in the data.
cells_ : list of object
The cells used to generalize records, as learned when calling fit.
ncp_ : float
The NCP (information loss) score of the resulting generalization,
as measured on the training data.
generalizations_ : object
The generalizations that were learned (actual feature ranges).
:param estimator: The original model for which generalization is being performed. Should be pre-fitted.
:type estimator: sklearn `BaseEstimator` or `Model`
:param target_accuracy: The required relative accuracy when applying the base model to the generalized data.
Accuracy is measured relative to the original accuracy of the model.
:type target_accuracy: float, optional
:param cells: The cells used to generalize records. Each cell must define a range or subset of categories for
each feature, as well as a representative value for each feature. This parameter should be used
when instantiating a transformer object without first fitting it.
:type cells: list of objects, optional
:param categorical_features: The list of categorical features (if supplied, these features will be one-hot
encoded before using them to train the decision tree model).
:type categorical_features: list of strings, optional
:param features_to_minimize: The features to be minimized.
:type features_to_minimize: list of strings or int, optional
:param train_only_QI: Whether to train the tree just on the ``features_to_minimize`` or on all features. Default
is only on ``features_to_minimize``.
:type train_only_QI: boolean, optional
:param is_regression: Whether the model is a regression model or not (if False, assumes a classification model).
Default is False.
:type is_regression: boolean, optional
"""
def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: float = 0.998,
cells: list = None, categorical_features: Union[np.ndarray, list] = None,
features_to_minimize: Union[np.ndarray, list] = None, train_only_QI: bool = True,
is_regression: bool = False):
def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: Optional[float] = 0.998,
cells: Optional[list] = None, categorical_features: Optional[Union[np.ndarray, list]] = None,
features_to_minimize: Optional[Union[np.ndarray, list]] = None, train_only_QI: Optional[bool] = True,
is_regression: Optional[bool] = False):
if issubclass(estimator.__class__, Model):
self.estimator = estimator
else:
@ -97,18 +79,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.is_regression = is_regression
def get_params(self, deep=True):
"""Get parameters for this estimator.
"""
Get parameters
Parameters
----------
deep : boolean, optional
If True, will return the parameters for this estimator and contained
subobjects that are estimators.
Returns
-------
params : mapping of string to any
Parameter names mapped to their values.
:param deep: If True, will return the parameters for this estimator and contained
sub-objects that are estimators.
:type deep: boolean, optional
:return: Parameter names mapped to their values
"""
ret = {}
ret['target_accuracy'] = self.target_accuracy
@ -120,12 +97,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return ret
def set_params(self, **params):
"""Set the parameters of this estimator.
"""
Set parameters
Returns
-------
self : object
Returns self.
:param target_accuracy: The required relative accuracy when applying the base model to the generalized data.
Accuracy is measured relative to the original accuracy of the model.
:type target_accuracy: float, optional
:param cells: The cells used to generalize records. Each cell must define a range or subset of categories for
each feature, as well as a representative value for each feature. This parameter should be used
when instantiating a transformer object without first fitting it.
:type cells: list of objects, optional
:return: self
"""
if 'target_accuracy' in params:
self.target_accuracy = params['target_accuracy']
@ -135,29 +117,32 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
@property
def generalizations(self):
"""
Return the generalizations derived from the model and test data.
:return: generalizations object. Contains 3 sections: 'ranges' that contains ranges for numerical features,
'categories' that contains sub-groups of categories for categorical features, and
'untouched' that contains the features that could not be generalized.
"""
return self.generalizations_
def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
features_names: Optional = None, dataset: Optional[ArrayDataset] = None):
"""Learns the generalizations based on training data, and applies them to the data.
features_names: Optional[list] = None, dataset: Optional[ArrayDataset] = None):
"""
Learns the generalizations based on training data, and applies them to the data.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
The training input samples.
y : array-like, shape (n_samples,), optional
The target values. An array of int.
This should contain the predictions of the original model on ``X``.
features_names : list of str, The feature names, in the order that they appear in the data,
provided just if X and y were provided (optional).
dataset : Data wrapper containing the training input samples and the predictions of the
original model on the training data.
Either X,y OR dataset need to be provided, not both.
Returns
-------
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
The array containing the representative values to which each record in
``X`` is mapped.
:param X: The training input samples.
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
:param y: The target values. This should contain the predictions of the original model on ``X``.
:type y: array-like, shape (n_samples,), optional
:param features_names: The feature names, in the order that they appear in the data. Can be provided when
passing the data as ``X`` and ``y``
:type features_names: list of strings, optional
:param dataset: Data wrapper containing the training input samples and the predictions of the original model
on the training data. Either ``X``, ``y`` OR ``dataset`` need to be provided, not both.
:type dataset: `ArrayDataset`, optional
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
"""
self.fit(X, y, features_names, dataset=dataset)
return self.transform(X, features_names, dataset=dataset)
@ -166,23 +151,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
features_names: Optional = None, dataset: ArrayDataset = None):
"""Learns the generalizations based on training data.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
The training input samples.
y : array-like, shape (n_samples,), optional
The target values. An array of int.
This should contain the predictions of the original model on ``X``.
features_names : list of str, The feature names, in the order that they appear in the data,
provided just if X and y were provided (optional).
dataset : Data wrapper containing the training input samples and the predictions of the
original model on the training data.
Either X,y OR dataset need to be provided, not both.
Returns
-------
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
The array containing the representative values to which each record in
``X`` is mapped.
:param X: The training input samples.
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
:param y: The target values. This should contain the predictions of the original model on ``X``.
:type y: array-like, shape (n_samples,), optional
:param features_names: The feature names, in the order that they appear in the data. Can be provided when
passing the data as ``X`` and ``y``
:type features_names: list of strings, optional
:param dataset: Data wrapper containing the training input samples and the predictions of the original model
on the training data. Either ``X``, ``y`` OR ``dataset`` need to be provided, not both.
:type dataset: `ArrayDataset`, optional
:return: self
"""
# take into account that estimator, X, y, cells, features may be None
@ -242,7 +221,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
used_X_train = X_train_QI
# collect feature data (such as min, max)
feature_data = {}
for feature in self._features:
if feature not in feature_data.keys():
@ -386,23 +364,20 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# Return the transformer
return self
def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional = None, dataset: ArrayDataset = None):
def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None,
dataset: Optional[ArrayDataset] = None):
""" Transforms data records to representative points.
Parameters
----------
X : {array-like, sparse-matrix}, shape (n_samples, n_features), If provided as a pandas dataframe,
may contain both numeric and categorical data.
The input samples.
features_names : list of str, The feature names, in the order that they appear in the data,
provided just if X was provided (optional).
dataset : Data wrapper containing the training input samples.
Either X OR dataset need to be provided, not both.
Returns
-------
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
The array containing the representative values to which each record in
``X`` is mapped.
:param X: The input samples to transform.
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
:param features_names: The feature names, in the order that they appear in the data. Can be provided when
passing the data as ``X``.
:type features_names: list of strings, optional
:param dataset: Data wrapper containing the training input samples and the predictions of the original model
on the training data. Either ``X`` OR ``dataset`` need to be provided, not both.
:type dataset: `ArrayDataset`, optional
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
"""
# Check if fit has been called

View file

@ -6,6 +6,17 @@ from os import path, mkdir
from six.moves.urllib.request import urlretrieve
def get_iris_dataset(test_set: float = 0.3):
"""
Loads the Iris dataset from scikit-learn.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1). Default is 0.3
:type test_set: float
:return: Entire dataset and labels as numpy arrays. Returned as a tuple (x_train, y_train), (x_test, y_test)
"""
return _load_iris(test_set)
def _load_iris(test_set_size: float = 0.3):
iris = datasets.load_iris()
data = iris.data
@ -18,14 +29,15 @@ def _load_iris(test_set_size: float = 0.3):
return (x_train, y_train), (x_test, y_test)
def get_iris_dataset(test_set: float = 0.3):
def get_diabetes_dataset(test_set: float = 0.3):
"""
Loads the Iris dataset from scikit-learn.
Loads the Diabetes dataset from scikit-learn.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1).
:return: Entire dataset and labels as numpy array.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1). Default is 0.3
:type test_set: float
:return: Entire dataset and labels as numpy arrays. Returned as a tuple (x_train, y_train), (x_test, y_test)
"""
return _load_iris(test_set)
return _load_diabetes(test_set)
def _load_diabetes(test_set_size: float = 0.3):
@ -40,22 +52,14 @@ def _load_diabetes(test_set_size: float = 0.3):
return (x_train, y_train), (x_test, y_test)
def get_diabetes_dataset():
"""
Loads the Iris dataset from scikit-learn.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1).
:return: Entire dataset and labels as numpy array.
"""
return _load_diabetes()
def get_german_credit_dataset(test_set: float = 0.3):
"""
Loads the UCI German_credit dataset from `tests/datasets/german` or downloads it if necessary.
Loads the UCI German credit dataset from `tests/datasets/german` or downloads it from
https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/ if necessary.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1).
:return: Dataset and labels as pandas dataframes.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1). Default is 0.3
:type test_set: float
:return: Dataset and labels as pandas dataframes. Returned as a tuple (x_train, y_train), (x_test, y_test)
"""
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'
@ -124,9 +128,10 @@ def _modify_german_dataset(data):
def get_adult_dataset():
"""
Loads the UCI Adult dataset from `tests/datasets/adult` or downloads it if necessary.
Loads the UCI Adult dataset from `tests/datasets/adult` or downloads it from
https://archive.ics.uci.edu/ml/machine-learning-databases/adult/ if necessary.
:return: Dataset and labels as pandas dataframes.
:return: Dataset and labels as pandas dataframes. Returned as a tuple (x_train, y_train), (x_test, y_test)
"""
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
@ -225,15 +230,20 @@ def _modify_adult_dataset(data):
def get_nursery_dataset(raw: bool = True, test_set: float = 0.2, transform_social: bool = False):
"""
Loads the UCI Nursery dataset from `tests/datasets/nursery` or downloads it if necessary.
Loads the UCI Nursery dataset from `tests/datasets/nursery` or downloads it from
https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/ if necessary.
:param raw: `True` if no preprocessing should be applied to the data. Otherwise, categorical data is one-hot
encoded and data is scaled using sklearn's StandardScaler.
:param test_set: Proportion of the data to use as validation split. The value should be between 0 and 1.
:type raw: boolean
:param test_set: Proportion of the data to use as validation split. The value should be between 0 and 1. Default is
0.2
:type test_set: float
:param transform_social: If `True`, transforms the social feature to be binary for the purpose of attribute
inference. This is done by assigning the original value 'problematic' the new value 1, and
the other original values are assigned the new value 0.
:return: Dataset and labels as pandas dataframes.
:type transform_social: boolean
:return: Dataset and labels as pandas dataframes. Returned as a tuple (x_train, y_train), (x_test, y_test)
"""
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data'
data_dir = '../datasets/nursery'

View file

@ -24,41 +24,6 @@ OUTPUT_DATA_ARRAY_TYPE = np.ndarray
DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame]
def array2numpy(self, arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
"""
converts from INPUT_DATA_ARRAY_TYPE to numpy array
"""
if type(arr) == np.ndarray:
return arr
if type(arr) == pd.DataFrame or type(arr) == pd.Series:
self.is_pandas = True
return arr.to_numpy()
if isinstance(arr, list):
return np.array(arr)
if type(arr) == Tensor:
return arr.detach().cpu().numpy()
raise ValueError('Non supported type: ', type(arr).__name__)
def array2torch_tensor(self, arr: INPUT_DATA_ARRAY_TYPE) -> Tensor:
"""
converts from INPUT_DATA_ARRAY_TYPE to torch tensor array
"""
if type(arr) == np.ndarray:
return torch.from_numpy(arr)
if type(arr) == pd.DataFrame or type(arr) == pd.Series:
self.is_pandas = True
return torch.from_numpy(arr.to_numpy())
if isinstance(arr, list):
return torch.tensor(arr)
if type(arr) == Tensor:
return arr
raise ValueError('Non supported type: ', type(arr).__name__)
class Dataset(metaclass=ABCMeta):
"""Base Abstract Class for Dataset"""
@ -68,36 +33,99 @@ class Dataset(metaclass=ABCMeta):
@abstractmethod
def get_samples(self) -> Collection[Any]:
"""Return data samples"""
"""
Return data samples
:return: the data samples
"""
pass
@abstractmethod
def get_labels(self) -> Collection[Any]:
"""Return labels"""
"""
Return labels
:return: the labels
"""
pass
def _array2numpy(self, arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
"""
Converts from INPUT_DATA_ARRAY_TYPE to numpy array
:param arr: the array to transform
:type arr: numpy array or pandas DataFrame or list or pytorch Tensor
:return: the array transformed into a numpy array
"""
if type(arr) == np.ndarray:
return arr
if type(arr) == pd.DataFrame or type(arr) == pd.Series:
self.is_pandas = True
return arr.to_numpy()
if isinstance(arr, list):
return np.array(arr)
if type(arr) == Tensor:
return arr.detach().cpu().numpy()
raise ValueError('Non supported type: ', type(arr).__name__)
def _array2torch_tensor(self, arr: INPUT_DATA_ARRAY_TYPE) -> Tensor:
"""
Converts from INPUT_DATA_ARRAY_TYPE to torch tensor array
:param arr: the array to transform
:type arr: numpy array or pandas DataFrame or list or pytorch Tensor
:return: the array transformed into a pytorch Tensor
"""
if type(arr) == np.ndarray:
return torch.from_numpy(arr)
if type(arr) == pd.DataFrame or type(arr) == pd.Series:
self.is_pandas = True
return torch.from_numpy(arr.to_numpy())
if isinstance(arr, list):
return torch.tensor(arr)
if type(arr) == Tensor:
return arr
raise ValueError('Non supported type: ', type(arr).__name__)
class StoredDataset(Dataset):
"""Abstract Class for Storable Dataset"""
"""Abstract Class for a Dataset that can be downloaded from a URL and stored in a file"""
@abstractmethod
def load_from_file(self, path: str):
"""Load dataset from file"""
"""
Load dataset from file
:param path: the path to the file
:type path: string
:return: None
"""
pass
@abstractmethod
def load(self, **kwargs):
"""Load dataset"""
"""
Load dataset
:return: None
"""
pass
@staticmethod
def download(url: str, dest_path: str, filename: str, unzip: bool = False) -> None:
def download(url: str, dest_path: str, filename: str, unzip: Optional[bool] = False) -> None:
"""
Download the dataset from URL
:param url: dataset URL, the dataset will be requested from this URL
:type url: string
:param dest_path: local dataset destination path
:type dest_path: string
:param filename: local dataset filename
:param unzip: flag whether or not perform extraction
:type filename: string
:param unzip: flag indicating whether or not to perform extraction. Default is False.
:type unzip: boolean, optional
:return: None
"""
file_path = os.path.join(dest_path, filename)
@ -115,12 +143,16 @@ class StoredDataset(Dataset):
StoredDataset.extract_archive(zip_path=file_path, dest_path=dest_path, remove_archive=False)
@staticmethod
def extract_archive(zip_path: str, dest_path=None, remove_archive=False):
def extract_archive(zip_path: str, dest_path: Optional[str] = None, remove_archive: Optional[bool] = False):
"""
Extract dataset from archived file
:param zip_path: path to archived file
:type zip_path: string
:param dest_path: directory path to uncompress the file to
:param remove_archive: whether remove the archive file after uncompress (default False)
:type dest_path: string, optional
:param remove_archive: whether to remove the archive file after decompression. Default is False.
:type remove_archive: boolean, optional
:return: None
"""
logger.info("Extracting the dataset...")
@ -134,15 +166,23 @@ class StoredDataset(Dataset):
logger.info("Extracted the dataset")
@staticmethod
def split_debug(datafile: str, dest_datafile: str, ratio: int, shuffle=True, delimiter=",", fmt=None) -> None:
def split_debug(datafile: str, dest_datafile: str, ratio: int, shuffle: Optional[bool] = True,
delimiter: Optional[str] = ",", fmt: Optional[Union[str, list]] = None) -> None:
"""
Split the data and take only a part of it
:param datafile: dataset file path
:type datafile: string
:param dest_datafile: destination path for the partial dataset file
:type dest_datafile: string
:param ratio: part of the dataset to save
:param shuffle: whether to shuffle the data or not (default True)
:param delimiter: dataset delimiter (default ",")
:param fmt: format for the correct data saving
:type ratio: int
:param shuffle: whether to shuffle the data or not. Default is True.
:type shuffle: boolean, optional
:param delimiter: dataset delimiter. Default is ","
:type delimiter: string, optional
:param fmt: format for the correct data saving. As defined by numpy.savetxt(). Default is None.
:type fmt: string or sequence of strings, optional
:return: None
"""
if os.path.isfile(dest_datafile):
@ -162,21 +202,23 @@ class StoredDataset(Dataset):
class ArrayDataset(Dataset):
"""Dataset that is based on x and y arrays (e.g., numpy/pandas/list...)"""
"""
Dataset that is based on x and y arrays (e.g., numpy/pandas/list...)
:param x: collection of data samples
:type x: numpy array or pandas DataFrame or list or pytorch Tensor
:param y: collection of labels
:type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
:param features_names: The feature names, in the order that they appear in the data
:type feature_names: list of strings, optional
"""
def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
features_names: Optional = None, **kwargs):
"""
ArrayDataset constructor.
:param x: collection of data samples
:param y: collection of labels (optional)
:param feature_names: list of str, The feature names, in the order that they appear in the data (optional)
:param kwargs: dataset parameters
"""
features_names: Optional[list] = None, **kwargs):
self.is_pandas = False
self.features_names = features_names
self._y = array2numpy(self, y) if y is not None else None
self._x = array2numpy(self, x)
self._y = self._array2numpy(y) if y is not None else None
self._x = self._array2numpy(x)
if self.is_pandas:
if features_names and not np.array_equal(features_names, x.columns):
raise ValueError("The supplied features are not the same as in the data features")
@ -186,51 +228,80 @@ class ArrayDataset(Dataset):
raise ValueError('Non equivalent lengths of x and y')
def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""Return data samples as numpy array"""
"""
Get data samples
:return: data samples as numpy array
"""
return self._x
def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""Return labels as numpy array"""
"""
Get labels
:return: labels as numpy array
"""
return self._y
class PytorchData(Dataset):
"""
Dataset for pytorch models.
:param x: collection of data samples
:type x: numpy array or pandas DataFrame or list or pytorch Tensor
:param y: collection of labels
:type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
"""
def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None, **kwargs):
"""
PytorchData constructor.
:param x: collection of data samples
:param y: collection of labels (optional)
:param kwargs: dataset parameters
"""
self.is_pandas = False
self._y = array2torch_tensor(self, y) if y is not None else None
self._x = array2torch_tensor(self, x)
self._y = self._array2torch_tensor(y) if y is not None else None
self._x = self._array2torch_tensor(x)
if self.is_pandas:
self.features_names = x.columns
if y is not None and len(self._x) != len(self._y):
raise ValueError('Non equivalent lengths of x and y')
if self._y is not None:
self.__getitem__ = self.get_item
else:
self.__getitem__ = self.get_sample_item
def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""Return data samples as numpy array"""
return array2numpy(self._x)
"""
Get data samples.
:return: samples as numpy array
"""
return self._array2numpy(self._x)
def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
"""Return labels as numpy array"""
return array2numpy(self._y) if self._y is not None else None
"""
Get labels.
def get_sample_item(self, idx) -> Tensor:
:return: labels as numpy array
"""
return self._array2numpy(self._y) if self._y is not None else None
def get_sample_item(self, idx: int) -> Tensor:
"""
Get the sample according to the given index
:param idx: the index of the sample to return
:type idx: int
:return: the sample as a pytorch Tensor
"""
return self.x[idx]
def get_item(self, idx) -> Tensor:
def get_item(self, idx: int) -> Tensor:
"""
Get the sample and label according to the given index
:param idx: the index of the sample to return
:type idx: int
:return: the sample and label as pytorch Tensors. Returned as a tuple (sample, label)
"""
sample, label = self.x[idx], self.y[idx]
return sample, label
@ -246,8 +317,10 @@ class DatasetFactory:
def register(cls, name: str) -> Callable:
"""
Class method to register Dataset to the internal registry
:param name: dataset name
:return:
:type name: string
:return: a Callable that returns the registered dataset class
"""
def inner_wrapper(wrapped_class: Dataset) -> Any:
@ -262,11 +335,15 @@ class DatasetFactory:
def create_dataset(cls, name: str, **kwargs) -> Dataset:
"""
Factory command to create dataset instance.
This method gets the appropriate Dataset class from the registry
and creates an instance of it, while passing in the parameters
given in ``kwargs``.
:param name: The name of the dataset to create.
:type name: string
:param kwargs: dataset parameters
:type kwargs: keyword arguments as expected by the class
:return: An instance of the dataset that is created.
"""
if name not in cls.registry:
@ -280,13 +357,19 @@ class DatasetFactory:
class Data:
def __init__(self, train: Dataset = None, test: Dataset = None, **kwargs):
"""
Class for storing train and test datasets.
:param train: the training set
:type train: `Dataset`
:param test: the test set
:type test: `Dataset`, optional
"""
def __init__(self, train: Dataset = None, test: Optional[Dataset] = None, **kwargs):
"""
Data class constructor.
The class stores train and test datasets.
If neither of the datasets was provided,
Both train and test datasets will be create using
DatasetFactory to create a dataset instance
If neither of the datasets was provided, both train and test datasets will be created using `DatasetFactory`.
"""
if train or test:
self.train = train
@ -296,25 +379,49 @@ class Data:
self.test = DatasetFactory.create_dataset(train=False, **kwargs)
def get_train_set(self) -> Dataset:
"""Return train DatasetBase"""
"""
Get training set
:return: training `Dataset`
"""
return self.train
def get_test_set(self) -> Dataset:
"""Return test DatasetBase"""
"""
Get test set
:return: test `Dataset`
"""
return self.test
def get_train_samples(self) -> Collection[Any]:
"""Return train set samples"""
"""
Get train set samples
:return: training samples
"""
return self.train.get_samples()
def get_train_labels(self) -> Collection[Any]:
"""Return train set labels"""
"""
Get train set labels
:return: training labels
"""
return self.train.get_labels()
def get_test_samples(self) -> Collection[Any]:
"""Return test set samples"""
"""
Get test set samples
:return: test samples
"""
return self.test.get_samples()
def get_test_labels(self) -> Collection[Any]:
"""Return test set labels"""
"""
Get test set labels
:return: test labels
"""
return self.test.get_labels()

View file

@ -14,24 +14,25 @@ class ModelOutputType(Enum):
class Model(metaclass=ABCMeta):
"""
Abstract base class for ML model wrappers.
:param model: The original model object (of the underlying ML framework)
:type model: framework-specific model object
:param output_type: The type of output the model yields (vector/label only for classifiers,
value for regressors)
:type output_type: `ModelOutputType`
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Default is True.
:type black_box_access: boolean, optional
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Default is True.
:type unlimited_queries: boolean, optional
"""
def __init__(self, model: Any, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
unlimited_queries: Optional[bool] = True, **kwargs):
"""
Initialize a `Model` wrapper object.
:param model: The original model object (of the underlying ML framework)
:param output_type: The type of output the model yields (vector/label only for classifiers,
value for regressors)
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Optional, Default is True.
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Optional, Default is True.
"""
self._model = model
self._output_type = output_type
self._black_box_access = black_box_access
@ -54,7 +55,7 @@ class Model(metaclass=ABCMeta):
:param x: Input samples.
:type x: `np.ndarray` or `pandas.DataFrame`
:return: Predictions from the model.
:return: Predictions from the model as numpy array.
"""
raise NotImplementedError
@ -65,13 +66,14 @@ class Model(metaclass=ABCMeta):
:param test_data: Test data.
:type train_data: `Dataset`
:return: the score as float (for classifiers, between 0 and 1)
"""
return NotImplementedError
@property
def model(self) -> Any:
"""
Return the model.
Return the underlying model.
:return: The model.
"""
@ -89,21 +91,19 @@ class Model(metaclass=ABCMeta):
@property
def black_box_access(self) -> bool:
"""
Return True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals are also available.
Return whether the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, or if the model internals are also available.
:return: True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals are also available.
:return: True if the model is only available via query (API) access, otherwise False.
"""
return self._black_box_access
@property
def unlimited_queries(self) -> bool:
"""
If black_box_access is True, Return whether a user can perform unlimited queries to the model API
If black_box_access is True, return whether a user can perform unlimited queries to the model API
or whether there is a limit to the number of queries that can be submitted.
:return: If black_box_access is True, Return whether a user can perform unlimited queries to the model API
or whether there is a limit to the number of queries that can be submitted.
:return: True if a user can perform unlimited queries to the model API, otherwise False.
"""
return self._unlimited_queries

View file

@ -22,6 +22,7 @@ class SklearnModel(Model):
:param test_data: Test data.
:type train_data: `Dataset`
:return: the score as float (for classifiers, between 0 and 1)
"""
return self.model.score(test_data.get_samples(), test_data.get_labels(), **kwargs)
@ -29,23 +30,23 @@ class SklearnModel(Model):
class SklearnClassifier(SklearnModel):
"""
Wrapper class for scikit-learn classification models.
:param model: The original sklearn model object.
:type model: scikitlearn classifier object
:param output_type: The type of output the model yields (vector/label only)
:type output_type: `ModelOutputType`
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Default is True.
:type black_box_access: boolean, optional
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Default is True.
:type unlimited_queries: boolean, optional
"""
def __init__(self, model: BaseEstimator, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
unlimited_queries: Optional[bool] = True, **kwargs):
"""
Initialize a `SklearnClassifier` wrapper object.
:param model: The original sklearn model object.
:param output_type: The type of output the model yields (vector/label only for classifiers,
value for regressors)
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Optional, Default is True.
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Optional, Default is True.
"""
super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs)
self._art_model = ArtSklearnClassifier(model)
@ -55,6 +56,7 @@ class SklearnClassifier(SklearnModel):
:param train_data: Training data.
:type train_data: `Dataset`
:return: None
"""
encoder = OneHotEncoder(sparse=False)
y_encoded = encoder.fit_transform(train_data.get_labels().reshape(-1, 1))
@ -65,8 +67,8 @@ class SklearnClassifier(SklearnModel):
Perform predictions using the model for input `x`.
:param x: Input samples.
:type x: `np.ndarray` or `pandas.DataFrame`
:return: Predictions from the model (class probabilities, if supported).
:type x: `Dataset`
:return: Predictions from the model as numpy array (class probabilities, if supported).
"""
return self._art_model.predict(x, **kwargs)
@ -74,21 +76,21 @@ class SklearnClassifier(SklearnModel):
class SklearnRegressor(SklearnModel):
"""
Wrapper class for scikit-learn regression models.
:param model: The original sklearn model object.
:type model: scikitlearn regressor object
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Default is True.
:type black_box_access: boolean, optional
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Default is True.
:type unlimited_queries: boolean, optional
"""
def __init__(self, model: BaseEstimator, black_box_access: Optional[bool] = True,
unlimited_queries: Optional[bool] = True, **kwargs):
"""
Initialize a `SklearnRegressor` wrapper object.
:param model: The original sklearn model object.
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Optional, Default is True.
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Optional, Default is True.
"""
super().__init__(model, ModelOutputType.REGRESSOR_SCALAR, black_box_access, unlimited_queries, **kwargs)
self._art_model = ScikitlearnRegressor(model)
@ -98,6 +100,7 @@ class SklearnRegressor(SklearnModel):
:param train_data: Training data.
:type train_data: `Dataset`
:return: None
"""
self._art_model.fit(train_data.get_samples(), train_data.get_labels(), **kwargs)
@ -106,7 +109,7 @@ class SklearnRegressor(SklearnModel):
Perform predictions using the model for input `x`.
:param x: Input samples.
:type x: `np.ndarray` or `pandas.DataFrame`
:return: Predictions from the model.
:type x: `Dataset`
:return: Predictions from the model as numpy array.
"""
return self._art_model.predict(x, **kwargs)

View file

@ -32,8 +32,11 @@ master_doc = 'index'
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.duration',
'sphinx.ext.doctest',
'sphinx.ext.autodoc',
'sphinx.ext.napoleon'
'sphinx.ext.autosummary',
'sphinx.ext.intersphinx',
]
# Add any paths that contain templates here, relative to this directory.
@ -50,7 +53,7 @@ exclude_patterns = []
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'
html_theme = 'pyramid'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,

View file

@ -0,0 +1,22 @@
apt.utils.datasets package
==========================
Submodules
----------
apt.utils.datasets.datasets module
----------------------------------
.. automodule:: apt.utils.datasets.datasets
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: apt.utils.datasets
:members:
:undoc-members:
:show-inheritance:

View file

@ -0,0 +1,30 @@
apt.utils.models package
========================
Submodules
----------
apt.utils.models.model module
-----------------------------
.. automodule:: apt.utils.models.model
:members:
:undoc-members:
:show-inheritance:
apt.utils.models.sklearn\_model module
--------------------------------------
.. automodule:: apt.utils.models.sklearn_model
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: apt.utils.models
:members:
:undoc-members:
:show-inheritance:

31
docs/source/apt.utils.rst Normal file
View file

@ -0,0 +1,31 @@
apt.utils package
=================
Subpackages
-----------
.. toctree::
:maxdepth: 4
apt.utils.datasets
apt.utils.models
Submodules
----------
apt.utils.dataset\_utils module
-------------------------------
.. automodule:: apt.utils.dataset_utils
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: apt.utils
:members:
:undoc-members:
:show-inheritance:

View file

@ -2,6 +2,7 @@ numpy==1.21.0
pandas==1.1.0
scipy==1.4.1
scikit-learn==0.22.2
torch>=1.8.0
adversarial-robustness-toolbox>=1.10.1
# testing