Merge with main

2026-04-27 05:46:22 +02:00 · 2022-08-01 18:11:34 +03:00 · 2022-08-01 18:11:34 +03:00 · dc5cc793ee
commit dc5cc793ee
parent 74ce92acc4 a9e2a35e18
30 changed files with 2819 additions and 1066 deletions
--- a/apt/utils/datasets/datasets.py
+++ b/apt/utils/datasets/datasets.py
@ -5,7 +5,7 @@ Implementation of utility classes for dataset handling
 """

 from abc import ABCMeta, abstractmethod
-from typing import Callable, Collection, Any, Union, List, Optional
+from typing import Callable, Collection, Any, Union, List, Optional, Type

 import tarfile
 import os
@ -66,36 +66,68 @@ class Dataset(metaclass=ABCMeta):

    @abstractmethod
    def get_samples(self) -> Collection[Any]:
-        """Return data samples"""
-        pass
+        """
+        Return data samples
+
+        :return: the data samples
+        """
+        raise NotImplementedError

    @abstractmethod
    def get_labels(self) -> Collection[Any]:
-        """Return labels"""
-        pass
+        """
+        Return labels
+
+        :return: the labels
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
+        """
+        Get predictions
+
+        :return: predictions as numpy array
+        """
+        raise NotImplementedError


 class StoredDataset(Dataset):
-    """Abstract Class for Storable Dataset"""
+    """Abstract Class for a Dataset that can be downloaded from a URL and stored in a file"""

    @abstractmethod
    def load_from_file(self, path: str):
-        """Load dataset from file"""
-        pass
+        """
+        Load dataset from file
+
+        :param path: the path to the file
+        :type path: string
+        :return: None
+        """
+        raise NotImplementedError

    @abstractmethod
    def load(self, **kwargs):
-        """Load dataset"""
-        pass
+        """
+        Load dataset
+
+        :return: None
+        """
+        raise NotImplementedError

    @staticmethod
-    def download(url: str, dest_path: str, filename: str, unzip: bool = False) -> None:
+    def download(url: str, dest_path: str, filename: str, unzip: Optional[bool] = False) -> None:
        """
        Download the dataset from URL
+
        :param url: dataset URL, the dataset will be requested from this URL
+        :type url: string
        :param dest_path: local dataset destination path
+        :type dest_path: string
        :param filename: local dataset filename
-        :param unzip: flag whether or not perform extraction
+        :type filename: string
+        :param unzip: flag indicating whether or not to perform extraction. Default is False.
+        :type unzip: boolean, optional
        :return: None
        """
        file_path = os.path.join(dest_path, filename)
@ -113,12 +145,16 @@ class StoredDataset(Dataset):
            StoredDataset.extract_archive(zip_path=file_path, dest_path=dest_path, remove_archive=False)

    @staticmethod
-    def extract_archive(zip_path: str, dest_path=None, remove_archive=False):
+    def extract_archive(zip_path: str, dest_path: Optional[str] = None, remove_archive: Optional[bool] = False):
        """
        Extract dataset from archived file
+
        :param zip_path: path to archived file
+        :type zip_path: string
        :param dest_path: directory path to uncompress the file to
-        :param remove_archive: whether remove the archive file after uncompress (default False)
+        :type dest_path: string, optional
+        :param remove_archive: whether to remove the archive file after extraction. Default is False.
+        :type remove_archive: boolean, optional
        :return: None
        """
        logger.info("Extracting the dataset...")
@ -132,15 +168,23 @@ class StoredDataset(Dataset):
        logger.info("Extracted the dataset")

    @staticmethod
-    def split_debug(datafile: str, dest_datafile: str, ratio: int, shuffle=True, delimiter=",", fmt=None) -> None:
+    def split_debug(datafile: str, dest_datafile: str, ratio: int, shuffle: Optional[bool] = True,
+                    delimiter: Optional[str] = ",", fmt: Optional[Union[str, list]] = None) -> None:
        """
        Split the data and take only a part of it
+
        :param datafile: dataset file path
+        :type datafile: string
        :param dest_datafile: destination path for the partial dataset file
+        :type dest_datafile: string
        :param ratio: part of the dataset to save
-        :param shuffle: whether to shuffle the data or not (default True)
-        :param delimiter: dataset delimiter (default ",")
-        :param fmt: format for the correct data saving
+        :type ratio: int
+        :param shuffle: whether to shuffle the data or not. Default is True.
+        :type shuffle: boolean, optional
+        :param delimiter: dataset delimiter. Default is ","
+        :type delimiter: string, optional
+        :param fmt: format for the correct data saving. As defined by numpy.savetxt(). Default is None.
+        :type fmt: string or sequence of strings, optional
        :return: None
        """
        if os.path.isfile(dest_datafile):
@ -160,22 +204,19 @@ class StoredDataset(Dataset):


 class ArrayDataset(Dataset):
-    """Dataset that is based on x and y arrays (e.g., numpy/pandas/list...)"""
+    """
+    Dataset that is based on x and y arrays (e.g., numpy/pandas/list...)

-    def __init__(
-        self,
-        x: INPUT_DATA_ARRAY_TYPE,
-        y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
-        features_names: Optional = None,
-        **kwargs,
-    ):
-        """
-        ArrayDataset constructor.
-        :param x: collection of data samples
-        :param y: collection of labels (optional)
-        :param feature_names: list of str, The feature names, in the order that they appear in the data (optional)
-        :param kwargs: dataset parameters
-        """
+    :param x: collection of data samples
+    :type x: numpy array or pandas DataFrame or list or pytorch Tensor
+    :param y: collection of labels
+    :type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
+    :param features_names: The feature names, in the order that they appear in the data
+    :type features_names: list of strings, optional
+    """
+
+    def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
+                 features_names: Optional[list] = None, **kwargs):
        self.is_pandas = self.is_pandas = type(x) == pd.DataFrame or type(x) == pd.Series

        self.features_names = features_names
@ -187,26 +228,100 @@ class ArrayDataset(Dataset):
                raise ValueError("The supplied features are not the same as in the data features")
            self.features_names = x.columns.to_list()

-        if y is not None and len(self._x) != len(self._y):
+        if self._y is not None and len(self._x) != len(self._y):
            raise ValueError("Non equivalent lengths of x and y")

    def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
-        """Return data samples as numpy array"""
+        """
+        Get data samples
+
+        :return: data samples as numpy array
+        """
        return self._x

    def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
-        """Return labels as numpy array"""
+        """
+        Get labels
+
+        :return: labels as numpy array
+        """
        return self._y

+    def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
+        """
+        Get predictions
+
+        :return: predictions as numpy array
+        """
+        return None
+
+
+class DatasetWithPredictions(Dataset):
+    """
+    Dataset that is based on arrays (e.g., numpy/pandas/list...). Includes predictions from a model, and possibly also
+    features and true labels.
+
+    :param x: collection of data samples
+    :type x: numpy array or pandas DataFrame or list or pytorch Tensor
+    :param y: collection of labels
+    :type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
+    :param features_names: The feature names, in the order that they appear in the data
+    :type features_names: list of strings, optional
+    """
+
+    def __init__(self, pred: INPUT_DATA_ARRAY_TYPE, x: Optional[INPUT_DATA_ARRAY_TYPE] = None,
+                 y: Optional[INPUT_DATA_ARRAY_TYPE] = None, features_names: Optional[list] = None, **kwargs):
+        self.is_pandas = False
+        self.features_names = features_names
+        self._pred = self._array2numpy(pred)
+        self._y = self._array2numpy(y) if y is not None else None
+        self._x = self._array2numpy(x) if x is not None else None
+        if self.is_pandas and x is not None:
+            if features_names and not np.array_equal(features_names, x.columns):
+                raise ValueError("The supplied features are not the same as in the data features")
+            self.features_names = x.columns.to_list()
+
+        if self._y is not None and len(self._pred) != len(self._y):
+            raise ValueError('Non equivalent lengths of pred and y')
+
+        if self._x is not None and len(self._x) != len(self._pred):
+            raise ValueError('Non equivalent lengths of x and pred')
+
+    def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
+        """
+        Get data samples
+
+        :return: data samples as numpy array
+        """
+        return self._x
+
+    def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
+        """
+        Get labels
+
+        :return: labels as numpy array
+        """
+        return self._y
+
+    def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
+        """
+        Get predictions
+
+        :return: predictions as numpy array
+        """
+        return self._pred
+

 class PytorchData(Dataset):
+    """
+    Dataset for pytorch models.
+
+    :param x: collection of data samples
+    :type x: numpy array or pandas DataFrame or list or pytorch Tensor
+    :param y: collection of labels
+    :type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
+    """
    def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None, **kwargs):
-        """
-        PytorchData constructor.
-        :param x: collection of data samples
-        :param y: collection of labels (optional)
-        :param kwargs: dataset parameters
-        """
        self._y = array2torch_tensor(y) if y is not None else None
        self._x = array2torch_tensor(x)

@ -215,7 +330,7 @@ class PytorchData(Dataset):
        if self.is_pandas:
            self.features_names = x.columns

-        if y is not None and len(self._x) != len(self._y):
+        if self._y is not None and len(self._x) != len(self._y):
            raise ValueError("Non equivalent lengths of x and y")

        if self._y is not None:
@ -224,17 +339,47 @@ class PytorchData(Dataset):
            self.__getitem__ = self.get_sample_item

    def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
-        """Return data samples as numpy array"""
+        """
+        Get data samples.
+
+        :return: samples as numpy array
+        """
        return array2numpy(self._x)

    def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
-        """Return labels as numpy array"""
+        """
+        Get labels.
+
+        :return: labels as numpy array
+        """
        return array2numpy(self._y) if self._y is not None else None

-    def get_sample_item(self, idx) -> Tensor:
+    def get_predictions(self) -> OUTPUT_DATA_ARRAY_TYPE:
+        """
+        Get predictions
+
+        :return: predictions as numpy array
+        """
+        return None
+
+    def get_sample_item(self, idx: int) -> Tensor:
+        """
+        Get the sample according to the given index
+
+        :param idx: the index of the sample to return
+        :type idx: int
+        :return: the sample as a pytorch Tensor
+        """
        return self._x[idx]

-    def get_item(self, idx) -> Tensor:
+    def get_item(self, idx: int) -> Tensor:
+        """
+        Get the sample and label according to the given index
+
+        :param idx: the index of the sample to return
+        :type idx: int
+        :return: the sample and label as pytorch Tensors. Returned as a tuple (sample, label)
+        """
        sample, label = self._x[idx], self._y[idx]
        return sample, label

@ -251,11 +396,13 @@ class DatasetFactory:
    def register(cls, name: str) -> Callable:
        """
        Class method to register Dataset to the internal registry
+
        :param name: dataset name
-        :return:
+        :type name: string
+        :return: a Callable that returns the registered dataset class
        """

-        def inner_wrapper(wrapped_class: Dataset) -> Any:
+        def inner_wrapper(wrapped_class: Type[Dataset]) -> Any:
            if name in cls.registry:
                logger.warning("Dataset %s already exists. Will replace it", name)
            cls.registry[name] = wrapped_class
@ -267,11 +414,15 @@ class DatasetFactory:
    def create_dataset(cls, name: str, **kwargs) -> Dataset:
        """
        Factory command to create dataset instance.
+
        This method gets the appropriate Dataset class from the registry
        and creates an instance of it, while passing in the parameters
        given in ``kwargs``.
+
        :param name: The name of the dataset to create.
+        :type name: string
        :param kwargs: dataset parameters
+        :type kwargs: keyword arguments as expected by the class
        :return: An instance of the dataset that is created.
        """
        if name not in cls.registry:
@ -285,13 +436,19 @@ class DatasetFactory:


 class Data:
-    def __init__(self, train: Dataset = None, test: Dataset = None, **kwargs):
+    """
+    Class for storing train and test datasets.
+
+    :param train: the training set
+    :type train: `Dataset`
+    :param test: the test set
+    :type test: `Dataset`, optional
+    """
+    def __init__(self, train: Dataset = None, test: Optional[Dataset] = None, **kwargs):
        """
        Data class constructor.
-        The class stores train and test datasets.
-        If neither of the datasets was provided,
-        Both train and test datasets will be create using
-        DatasetFactory to create a dataset instance
+
+        If neither of the datasets was provided, both train and test datasets will be created using `DatasetFactory`.
        """
        if train or test:
            self.train = train
@ -301,25 +458,77 @@ class Data:
            self.test = DatasetFactory.create_dataset(train=False, **kwargs)

    def get_train_set(self) -> Dataset:
-        """Return train DatasetBase"""
+        """
+        Get training set
+
+        :return: training `Dataset`
+        """
        return self.train

    def get_test_set(self) -> Dataset:
-        """Return test DatasetBase"""
+        """
+        Get test set
+
+        :return: test `Dataset`
+        """
        return self.test

    def get_train_samples(self) -> Collection[Any]:
-        """Return train set samples"""
+        """
+        Get train set samples, or None if no training data provided
+
+        :return: training samples
+        """
+        if self.train is None:
+            return None
        return self.train.get_samples()

    def get_train_labels(self) -> Collection[Any]:
-        """Return train set labels"""
+        """
+        Get train set labels, or None if no training labels provided
+
+        :return: training labels
+        """
+        if self.train is None:
+            return None
        return self.train.get_labels()

+    def get_train_predictions(self) -> Collection[Any]:
+        """
+        Get train set predictions, or None if no training predictions provided
+
+        :return: training predictions
+        """
+        if self.train is None:
+            return None
+        return self.train.get_predictions()
+
    def get_test_samples(self) -> Collection[Any]:
-        """Return test set samples"""
+        """
+        Get test set samples
+
+        :return: test samples, or None if no test data provided
+        """
+        if self.test is None:
+            return None
        return self.test.get_samples()

    def get_test_labels(self) -> Collection[Any]:
-        """Return test set labels"""
+        """
+        Get test set labels
+
+        :return: test labels, or None if no test labels provided
+        """
+        if self.test is None:
+            return None
        return self.test.get_labels()
+
+    def get_test_predictions(self) -> Collection[Any]:
+        """
+        Get test set predictions, or None if no test predictions provided
+
+        :return: test predictions
+        """
+        if self.test is None:
+            return None
+        return self.test.get_predictions()