mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-04-27 05:46:22 +02:00
Merge with main
This commit is contained in:
commit
dc5cc793ee
30 changed files with 2819 additions and 1066 deletions
|
|
@ -5,7 +5,7 @@ Implementation of utility classes for dataset handling
|
|||
"""
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from typing import Callable, Collection, Any, Union, List, Optional
|
||||
from typing import Callable, Collection, Any, Union, List, Optional, Type
|
||||
|
||||
import tarfile
|
||||
import os
|
||||
|
|
@ -66,36 +66,68 @@ class Dataset(metaclass=ABCMeta):
|
|||
|
||||
@abstractmethod
|
||||
def get_samples(self) -> Collection[Any]:
|
||||
"""Return data samples"""
|
||||
pass
|
||||
"""
|
||||
Return data samples
|
||||
|
||||
:return: the data samples
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def get_labels(self) -> Collection[Any]:
|
||||
"""Return labels"""
|
||||
pass
|
||||
"""
|
||||
Return labels
|
||||
|
||||
:return: the labels
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
|
||||
"""
|
||||
Get predictions
|
||||
|
||||
:return: predictions as numpy array
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class StoredDataset(Dataset):
|
||||
"""Abstract Class for Storable Dataset"""
|
||||
"""Abstract Class for a Dataset that can be downloaded from a URL and stored in a file"""
|
||||
|
||||
@abstractmethod
|
||||
def load_from_file(self, path: str):
|
||||
"""Load dataset from file"""
|
||||
pass
|
||||
"""
|
||||
Load dataset from file
|
||||
|
||||
:param path: the path to the file
|
||||
:type path: string
|
||||
:return: None
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def load(self, **kwargs):
|
||||
"""Load dataset"""
|
||||
pass
|
||||
"""
|
||||
Load dataset
|
||||
|
||||
:return: None
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
def download(url: str, dest_path: str, filename: str, unzip: bool = False) -> None:
|
||||
def download(url: str, dest_path: str, filename: str, unzip: Optional[bool] = False) -> None:
|
||||
"""
|
||||
Download the dataset from URL
|
||||
|
||||
:param url: dataset URL, the dataset will be requested from this URL
|
||||
:type url: string
|
||||
:param dest_path: local dataset destination path
|
||||
:type dest_path: string
|
||||
:param filename: local dataset filename
|
||||
:param unzip: flag whether or not perform extraction
|
||||
:type filename: string
|
||||
:param unzip: flag indicating whether or not to perform extraction. Default is False.
|
||||
:type unzip: boolean, optional
|
||||
:return: None
|
||||
"""
|
||||
file_path = os.path.join(dest_path, filename)
|
||||
|
|
@ -113,12 +145,16 @@ class StoredDataset(Dataset):
|
|||
StoredDataset.extract_archive(zip_path=file_path, dest_path=dest_path, remove_archive=False)
|
||||
|
||||
@staticmethod
|
||||
def extract_archive(zip_path: str, dest_path=None, remove_archive=False):
|
||||
def extract_archive(zip_path: str, dest_path: Optional[str] = None, remove_archive: Optional[bool] = False):
|
||||
"""
|
||||
Extract dataset from archived file
|
||||
|
||||
:param zip_path: path to archived file
|
||||
:type zip_path: string
|
||||
:param dest_path: directory path to uncompress the file to
|
||||
:param remove_archive: whether remove the archive file after uncompress (default False)
|
||||
:type dest_path: string, optional
|
||||
:param remove_archive: whether to remove the archive file after extraction. Default is False.
|
||||
:type remove_archive: boolean, optional
|
||||
:return: None
|
||||
"""
|
||||
logger.info("Extracting the dataset...")
|
||||
|
|
@ -132,15 +168,23 @@ class StoredDataset(Dataset):
|
|||
logger.info("Extracted the dataset")
|
||||
|
||||
@staticmethod
|
||||
def split_debug(datafile: str, dest_datafile: str, ratio: int, shuffle=True, delimiter=",", fmt=None) -> None:
|
||||
def split_debug(datafile: str, dest_datafile: str, ratio: int, shuffle: Optional[bool] = True,
|
||||
delimiter: Optional[str] = ",", fmt: Optional[Union[str, list]] = None) -> None:
|
||||
"""
|
||||
Split the data and take only a part of it
|
||||
|
||||
:param datafile: dataset file path
|
||||
:type datafile: string
|
||||
:param dest_datafile: destination path for the partial dataset file
|
||||
:type dest_datafile: string
|
||||
:param ratio: part of the dataset to save
|
||||
:param shuffle: whether to shuffle the data or not (default True)
|
||||
:param delimiter: dataset delimiter (default ",")
|
||||
:param fmt: format for the correct data saving
|
||||
:type ratio: int
|
||||
:param shuffle: whether to shuffle the data or not. Default is True.
|
||||
:type shuffle: boolean, optional
|
||||
:param delimiter: dataset delimiter. Default is ","
|
||||
:type delimiter: string, optional
|
||||
:param fmt: format for the correct data saving. As defined by numpy.savetxt(). Default is None.
|
||||
:type fmt: string or sequence of strings, optional
|
||||
:return: None
|
||||
"""
|
||||
if os.path.isfile(dest_datafile):
|
||||
|
|
@ -160,22 +204,19 @@ class StoredDataset(Dataset):
|
|||
|
||||
|
||||
class ArrayDataset(Dataset):
|
||||
"""Dataset that is based on x and y arrays (e.g., numpy/pandas/list...)"""
|
||||
"""
|
||||
Dataset that is based on x and y arrays (e.g., numpy/pandas/list...)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
x: INPUT_DATA_ARRAY_TYPE,
|
||||
y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
|
||||
features_names: Optional = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
ArrayDataset constructor.
|
||||
:param x: collection of data samples
|
||||
:param y: collection of labels (optional)
|
||||
:param feature_names: list of str, The feature names, in the order that they appear in the data (optional)
|
||||
:param kwargs: dataset parameters
|
||||
"""
|
||||
:param x: collection of data samples
|
||||
:type x: numpy array or pandas DataFrame or list or pytorch Tensor
|
||||
:param y: collection of labels
|
||||
:type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
|
||||
:param features_names: The feature names, in the order that they appear in the data
|
||||
:type features_names: list of strings, optional
|
||||
"""
|
||||
|
||||
def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
|
||||
features_names: Optional[list] = None, **kwargs):
|
||||
self.is_pandas = self.is_pandas = type(x) == pd.DataFrame or type(x) == pd.Series
|
||||
|
||||
self.features_names = features_names
|
||||
|
|
@ -187,26 +228,100 @@ class ArrayDataset(Dataset):
|
|||
raise ValueError("The supplied features are not the same as in the data features")
|
||||
self.features_names = x.columns.to_list()
|
||||
|
||||
if y is not None and len(self._x) != len(self._y):
|
||||
if self._y is not None and len(self._x) != len(self._y):
|
||||
raise ValueError("Non equivalent lengths of x and y")
|
||||
|
||||
def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
|
||||
"""Return data samples as numpy array"""
|
||||
"""
|
||||
Get data samples
|
||||
|
||||
:return: data samples as numpy array
|
||||
"""
|
||||
return self._x
|
||||
|
||||
def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
|
||||
"""Return labels as numpy array"""
|
||||
"""
|
||||
Get labels
|
||||
|
||||
:return: labels as numpy array
|
||||
"""
|
||||
return self._y
|
||||
|
||||
def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
|
||||
"""
|
||||
Get predictions
|
||||
|
||||
:return: predictions as numpy array
|
||||
"""
|
||||
return None
|
||||
|
||||
|
||||
class DatasetWithPredictions(Dataset):
|
||||
"""
|
||||
Dataset that is based on arrays (e.g., numpy/pandas/list...). Includes predictions from a model, and possibly also
|
||||
features and true labels.
|
||||
|
||||
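:param pred: collection of model predictions
|
||||
:type pred: numpy array or pandas DataFrame or list or pytorch Tensor
|
||||
:param x: collection of data samples
|
||||
:type x: numpy array or pandas DataFrame or list or pytorch Tensor, optional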
|
||||
:param y: collection of labels
|
||||
:type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
|
||||
:param features_names: The feature names, in the order that they appear in the data
|
||||
:type features_names: list of strings, optional
|
||||
"""
|
||||
|
||||
def __init__(self, pred: INPUT_DATA_ARRAY_TYPE, x: Optional[INPUT_DATA_ARRAY_TYPE] = None,
|
||||
y: Optional[INPUT_DATA_ARRAY_TYPE] = None, features_names: Optional[list] = None, **kwargs):
|
||||
self.is_pandas = False
|
||||
self.features_names = features_names
|
||||
self._pred = self._array2numpy(pred)
|
||||
self._y = self._array2numpy(y) if y is not None else None
|
||||
self._x = self._array2numpy(x) if x is not None else None
|
||||
if self.is_pandas and x is not None:
|
||||
if features_names and not np.array_equal(features_names, x.columns):
|
||||
raise ValueError("The supplied features are not the same as in the data features")
|
||||
self.features_names = x.columns.to_list()
|
||||
|
||||
if self._y is not None and len(self._pred) != len(self._y):
|
||||
raise ValueError('Non equivalent lengths of pred and y')
|
||||
|
||||
if self._x is not None and len(self._x) != len(self._pred):
|
||||
raise ValueError('Non equivalent lengths of x and pred')
|
||||
|
||||
def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
|
||||
"""
|
||||
Get data samples
|
||||
|
||||
:return: data samples as numpy array
|
||||
"""
|
||||
return self._x
|
||||
|
||||
def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
|
||||
"""
|
||||
Get labels
|
||||
|
||||
:return: labels as numpy array
|
||||
"""
|
||||
return self._y
|
||||
|
||||
def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
|
||||
"""
|
||||
Get predictions
|
||||
|
||||
:return: predictions as numpy array
|
||||
"""
|
||||
return self._pred
|
||||
|
||||
|
||||
class PytorchData(Dataset):
|
||||
"""
|
||||
Dataset for pytorch models.
|
||||
|
||||
:param x: collection of data samples
|
||||
:type x: numpy array or pandas DataFrame or list or pytorch Tensor
|
||||
:param y: collection of labels
|
||||
:type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
|
||||
"""
|
||||
def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None, **kwargs):
|
||||
"""
|
||||
PytorchData constructor.
|
||||
:param x: collection of data samples
|
||||
:param y: collection of labels (optional)
|
||||
:param kwargs: dataset parameters
|
||||
"""
|
||||
self._y = array2torch_tensor(y) if y is not None else None
|
||||
self._x = array2torch_tensor(x)
|
||||
|
||||
|
|
@ -215,7 +330,7 @@ class PytorchData(Dataset):
|
|||
if self.is_pandas:
|
||||
self.features_names = x.columns
|
||||
|
||||
if y is not None and len(self._x) != len(self._y):
|
||||
if self._y is not None and len(self._x) != len(self._y):
|
||||
raise ValueError("Non equivalent lengths of x and y")
|
||||
|
||||
if self._y is not None:
|
||||
|
|
@ -224,17 +339,47 @@ class PytorchData(Dataset):
|
|||
self.__getitem__ = self.get_sample_item
|
||||
|
||||
def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
|
||||
"""Return data samples as numpy array"""
|
||||
"""
|
||||
Get data samples.
|
||||
|
||||
:return: samples as numpy array
|
||||
"""
|
||||
return array2numpy(self._x)
|
||||
|
||||
def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
|
||||
"""Return labels as numpy array"""
|
||||
"""
|
||||
Get labels.
|
||||
|
||||
:return: labels as numpy array
|
||||
"""
|
||||
return array2numpy(self._y) if self._y is not None else None
|
||||
|
||||
def get_sample_item(self, idx) -> Tensor:
|
||||
def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
|
||||
"""
|
||||
Get predictions
|
||||
|
||||
:return: predictions as numpy array
|
||||
"""
|
||||
return None
|
||||
|
||||
def get_sample_item(self, idx: int) -> Tensor:
|
||||
"""
|
||||
Get the sample according to the given index
|
||||
|
||||
:param idx: the index of the sample to return
|
||||
:type idx: int
|
||||
:return: the sample as a pytorch Tensor
|
||||
"""
|
||||
return self._x[idx]
|
||||
|
||||
def get_item(self, idx) -> Tensor:
|
||||
def get_item(self, idx: int) -> Tensor:
|
||||
"""
|
||||
Get the sample and label according to the given index
|
||||
|
||||
:param idx: the index of the sample to return
|
||||
:type idx: int
|
||||
:return: the sample and label as pytorch Tensors. Returned as a tuple (sample, label)
|
||||
"""
|
||||
sample, label = self._x[idx], self._y[idx]
|
||||
return sample, label
|
||||
|
||||
|
|
@ -251,11 +396,13 @@ class DatasetFactory:
|
|||
def register(cls, name: str) -> Callable:
|
||||
"""
|
||||
Class method to register Dataset to the internal registry
|
||||
|
||||
:param name: dataset name
|
||||
:return:
|
||||
:type name: string
|
||||
:return: a Callable that returns the registered dataset class
|
||||
"""
|
||||
|
||||
def inner_wrapper(wrapped_class: Dataset) -> Any:
|
||||
def inner_wrapper(wrapped_class: Type[Dataset]) -> Any:
|
||||
if name in cls.registry:
|
||||
logger.warning("Dataset %s already exists. Will replace it", name)
|
||||
cls.registry[name] = wrapped_class
|
||||
|
|
@ -267,11 +414,15 @@ class DatasetFactory:
|
|||
def create_dataset(cls, name: str, **kwargs) -> Dataset:
|
||||
"""
|
||||
Factory command to create dataset instance.
|
||||
|
||||
This method gets the appropriate Dataset class from the registry
|
||||
and creates an instance of it, while passing in the parameters
|
||||
given in ``kwargs``.
|
||||
|
||||
:param name: The name of the dataset to create.
|
||||
:type name: string
|
||||
:param kwargs: dataset parameters
|
||||
:type kwargs: keyword arguments as expected by the class
|
||||
:return: An instance of the dataset that is created.
|
||||
"""
|
||||
if name not in cls.registry:
|
||||
|
|
@ -285,13 +436,19 @@ class DatasetFactory:
|
|||
|
||||
|
||||
class Data:
|
||||
def __init__(self, train: Dataset = None, test: Dataset = None, **kwargs):
|
||||
"""
|
||||
Class for storing train and test datasets.
|
||||
|
||||
:param train: the training set
|
||||
:type train: `Dataset`
|
||||
:param test: the test set
|
||||
:type test: `Dataset`, optional
|
||||
"""
|
||||
def __init__(self, train: Dataset = None, test: Optional[Dataset] = None, **kwargs):
|
||||
"""
|
||||
Data class constructor.
|
||||
The class stores train and test datasets.
|
||||
If neither of the datasets was provided,
|
||||
Both train and test datasets will be create using
|
||||
DatasetFactory to create a dataset instance
|
||||
|
||||
If neither of the datasets was provided, both train and test datasets will be created using `DatasetFactory`.
|
||||
"""
|
||||
if train or test:
|
||||
self.train = train
|
||||
|
|
@ -301,25 +458,77 @@ class Data:
|
|||
self.test = DatasetFactory.create_dataset(train=False, **kwargs)
|
||||
|
||||
def get_train_set(self) -> Dataset:
|
||||
"""Return train DatasetBase"""
|
||||
"""
|
||||
Get training set
|
||||
|
||||
:return: training `Dataset`
|
||||
"""
|
||||
return self.train
|
||||
|
||||
def get_test_set(self) -> Dataset:
|
||||
"""Return test DatasetBase"""
|
||||
"""
|
||||
Get test set
|
||||
|
||||
:return: test `Dataset`
|
||||
"""
|
||||
return self.test
|
||||
|
||||
def get_train_samples(self) -> Collection[Any]:
|
||||
"""Return train set samples"""
|
||||
"""
|
||||
Get train set samples, or None if no training data provided
|
||||
|
||||
:return: training samples
|
||||
"""
|
||||
if self.train is None:
|
||||
return None
|
||||
return self.train.get_samples()
|
||||
|
||||
def get_train_labels(self) -> Collection[Any]:
|
||||
"""Return train set labels"""
|
||||
"""
|
||||
Get train set labels, or None if no training labels provided
|
||||
|
||||
:return: training labels
|
||||
"""
|
||||
if self.train is None:
|
||||
return None
|
||||
return self.train.get_labels()
|
||||
|
||||
def get_train_predictions(self) -> Collection[Any]:
|
||||
"""
|
||||
Get train set predictions, or None if no training predictions provided
|
||||
|
||||
:return: training predictions
|
||||
"""
|
||||
if self.train is None:
|
||||
return None
|
||||
return self.train.get_predictions()
|
||||
|
||||
def get_test_samples(self) -> Collection[Any]:
|
||||
"""Return test set samples"""
|
||||
"""
|
||||
Get test set samples
|
||||
|
||||
:return: test samples, or None if no test data provided
|
||||
"""
|
||||
if self.test is None:
|
||||
return None
|
||||
return self.test.get_samples()
|
||||
|
||||
def get_test_labels(self) -> Collection[Any]:
|
||||
"""Return test set labels"""
|
||||
"""
|
||||
Get test set labels
|
||||
|
||||
:return: test labels, or None if no test labels provided
|
||||
"""
|
||||
if self.test is None:
|
||||
return None
|
||||
return self.test.get_labels()
|
||||
|
||||
def get_test_predictions(self) -> Collection[Any]:
|
||||
"""
|
||||
Get test set predictions, or None if no test predictions provided
|
||||
|
||||
:return: test predictions
|
||||
"""
|
||||
if self.test is None:
|
||||
return None
|
||||
return self.test.get_predictions()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue