Documentation updates (#29)

* Bump version to 0.1.0 (breaking changes to some APIs)

* Update documentation

* Update requirements

* gitignore
This commit is contained in:
abigailgold 2022-05-02 11:46:18 +03:00 committed by GitHub
parent 014aed9670
commit fd6be8e778
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 640 additions and 298 deletions

View file

@ -24,41 +24,6 @@ OUTPUT_DATA_ARRAY_TYPE = np.ndarray
# Tabular data may arrive either as a raw numpy array or as a pandas DataFrame.
DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame]
def array2numpy(self, arr: "INPUT_DATA_ARRAY_TYPE") -> "OUTPUT_DATA_ARRAY_TYPE":
    """
    Convert from INPUT_DATA_ARRAY_TYPE to a numpy array.

    Side effect: sets ``self.is_pandas = True`` when the input is a pandas
    DataFrame or Series, so the caller can remember the original container.

    :param arr: the array to transform
    :type arr: numpy array or pandas DataFrame/Series or list or pytorch Tensor
    :return: the data as a numpy array
    :raises ValueError: if the input type is not supported
    """
    # isinstance is the idiomatic type test and also accepts subclasses
    if isinstance(arr, np.ndarray):
        return arr
    if isinstance(arr, (pd.DataFrame, pd.Series)):
        self.is_pandas = True
        return arr.to_numpy()
    if isinstance(arr, list):
        return np.array(arr)
    if isinstance(arr, Tensor):
        # detach + cpu makes the conversion safe for tensors that require
        # grad or live on an accelerator device
        return arr.detach().cpu().numpy()
    # Build one message string instead of passing a (msg, type) tuple
    raise ValueError('Non supported type: ' + type(arr).__name__)
def array2torch_tensor(self, arr: "INPUT_DATA_ARRAY_TYPE") -> Tensor:
    """
    Convert from INPUT_DATA_ARRAY_TYPE to a pytorch Tensor.

    Side effect: sets ``self.is_pandas = True`` when the input is a pandas
    DataFrame or Series, so the caller can remember the original container.

    :param arr: the array to transform
    :type arr: numpy array or pandas DataFrame/Series or list or pytorch Tensor
    :return: the data as a pytorch Tensor
    :raises ValueError: if the input type is not supported
    """
    # isinstance is the idiomatic type test and also accepts subclasses
    if isinstance(arr, np.ndarray):
        return torch.from_numpy(arr)
    if isinstance(arr, (pd.DataFrame, pd.Series)):
        self.is_pandas = True
        return torch.from_numpy(arr.to_numpy())
    if isinstance(arr, list):
        return torch.tensor(arr)
    if isinstance(arr, Tensor):
        return arr
    # Build one message string instead of passing a (msg, type) tuple
    raise ValueError('Non supported type: ' + type(arr).__name__)
class Dataset(metaclass=ABCMeta):
"""Base Abstract Class for Dataset"""
@ -68,36 +33,99 @@ class Dataset(metaclass=ABCMeta):
@abstractmethod
def get_samples(self) -> Collection[Any]:
    """
    Return the data samples.

    :return: the data samples
    """
    # Removed the duplicate one-line docstring left over from the old version
    pass
@abstractmethod
def get_labels(self) -> Collection[Any]:
    """
    Return the labels.

    :return: the labels
    """
    # Removed the duplicate one-line docstring left over from the old version
    pass
def _array2numpy(self, arr: "INPUT_DATA_ARRAY_TYPE") -> "OUTPUT_DATA_ARRAY_TYPE":
    """
    Convert from INPUT_DATA_ARRAY_TYPE to a numpy array.

    Side effect: sets ``self.is_pandas = True`` when the input is a pandas
    DataFrame or Series, so the caller can remember the original container.

    :param arr: the array to transform
    :type arr: numpy array or pandas DataFrame/Series or list or pytorch Tensor
    :return: the array transformed into a numpy array
    :raises ValueError: if the input type is not supported
    """
    # isinstance is the idiomatic type test and also accepts subclasses
    if isinstance(arr, np.ndarray):
        return arr
    if isinstance(arr, (pd.DataFrame, pd.Series)):
        self.is_pandas = True
        return arr.to_numpy()
    if isinstance(arr, list):
        return np.array(arr)
    if isinstance(arr, Tensor):
        # detach + cpu makes the conversion safe for tensors that require
        # grad or live on an accelerator device
        return arr.detach().cpu().numpy()
    # Build one message string instead of passing a (msg, type) tuple
    raise ValueError('Non supported type: ' + type(arr).__name__)
def _array2torch_tensor(self, arr: "INPUT_DATA_ARRAY_TYPE") -> Tensor:
    """
    Convert from INPUT_DATA_ARRAY_TYPE to a pytorch Tensor.

    Side effect: sets ``self.is_pandas = True`` when the input is a pandas
    DataFrame or Series, so the caller can remember the original container.

    :param arr: the array to transform
    :type arr: numpy array or pandas DataFrame/Series or list or pytorch Tensor
    :return: the array transformed into a pytorch Tensor
    :raises ValueError: if the input type is not supported
    """
    # isinstance is the idiomatic type test and also accepts subclasses
    if isinstance(arr, np.ndarray):
        return torch.from_numpy(arr)
    if isinstance(arr, (pd.DataFrame, pd.Series)):
        self.is_pandas = True
        return torch.from_numpy(arr.to_numpy())
    if isinstance(arr, list):
        return torch.tensor(arr)
    if isinstance(arr, Tensor):
        return arr
    # Build one message string instead of passing a (msg, type) tuple
    raise ValueError('Non supported type: ' + type(arr).__name__)
class StoredDataset(Dataset):
"""Abstract Class for Storable Dataset"""
"""Abstract Class for a Dataset that can be downloaded from a URL and stored in a file"""
@abstractmethod
def load_from_file(self, path: str):
    """
    Load the dataset from a file.

    :param path: the path to the file
    :type path: string
    :return: None
    """
    # Removed the duplicate one-line docstring left over from the old version
    pass
@abstractmethod
def load(self, **kwargs):
    """
    Load the dataset.

    :return: None
    """
    # Removed the duplicate one-line docstring left over from the old version
    pass
@staticmethod
def download(url: str, dest_path: str, filename: str, unzip: bool = False) -> None:
def download(url: str, dest_path: str, filename: str, unzip: Optional[bool] = False) -> None:
"""
Download the dataset from URL
:param url: dataset URL, the dataset will be requested from this URL
:type url: string
:param dest_path: local dataset destination path
:type dest_path: string
:param filename: local dataset filename
:param unzip: flag whether or not perform extraction
:type filename: string
:param unzip: flag whether or not perform extraction. Default is False.
:type unzip: boolean, optional
:return: None
"""
file_path = os.path.join(dest_path, filename)
@ -115,12 +143,16 @@ class StoredDataset(Dataset):
StoredDataset.extract_archive(zip_path=file_path, dest_path=dest_path, remove_archive=False)
@staticmethod
def extract_archive(zip_path: str, dest_path=None, remove_archive=False):
def extract_archive(zip_path: str, dest_path: Optional[str] = None, remove_archive: Optional[bool] = False):
"""
Extract dataset from archived file
:param zip_path: path to archived file
:type zip_path: string
:param dest_path: directory path to uncompress the file to
:param remove_archive: whether remove the archive file after uncompress (default False)
:type dest_path: string, optional
:param remove_archive: whether remove the archive file after uncompress. Default is False.
:type remove_archive: boolean, optional
:return: None
"""
logger.info("Extracting the dataset...")
@ -134,15 +166,23 @@ class StoredDataset(Dataset):
logger.info("Extracted the dataset")
@staticmethod
def split_debug(datafile: str, dest_datafile: str, ratio: int, shuffle=True, delimiter=",", fmt=None) -> None:
def split_debug(datafile: str, dest_datafile: str, ratio: int, shuffle: Optional[bool] = True,
delimiter: Optional[str] = ",", fmt: Optional[Union[str, list]] = None) -> None:
"""
Split the data and take only a part of it
:param datafile: dataset file path
:type datafile: string
:param dest_datafile: destination path for the partial dataset file
:type dest_datafile: string
:param ratio: part of the dataset to save
:param shuffle: whether to shuffle the data or not (default True)
:param delimiter: dataset delimiter (default ",")
:param fmt: format for the correct data saving
:type ratio: int
:param shuffle: whether to shuffle the data or not. Default is True.
:type shuffle: boolean, optional
:param delimiter: dataset delimiter. Default is ","
:type delimiter: string, optional
:param fmt: format for the correct data saving. As defined by numpy.savetxt(). Default is None.
:type fmt: string or sequence of strings, optional
:return: None
"""
if os.path.isfile(dest_datafile):
@ -162,21 +202,23 @@ class StoredDataset(Dataset):
class ArrayDataset(Dataset):
"""Dataset that is based on x and y arrays (e.g., numpy/pandas/list...)"""
"""
Dataset that is based on x and y arrays (e.g., numpy/pandas/list...)
:param x: collection of data samples
:type x: numpy array or pandas DataFrame or list or pytorch Tensor
:param y: collection of labels
:type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
:param feature_names: The feature names, in the order that they appear in the data
:type feature_names: list of strings, optional
"""
def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
features_names: Optional = None, **kwargs):
"""
ArrayDataset constructor.
:param x: collection of data samples
:param y: collection of labels (optional)
:param feature_names: list of str, The feature names, in the order that they appear in the data (optional)
:param kwargs: dataset parameters
"""
features_names: Optional[list] = None, **kwargs):
self.is_pandas = False
self.features_names = features_names
self._y = array2numpy(self, y) if y is not None else None
self._x = array2numpy(self, x)
self._y = self._array2numpy(y) if y is not None else None
self._x = self._array2numpy(x)
if self.is_pandas:
if features_names and not np.array_equal(features_names, x.columns):
raise ValueError("The supplied features are not the same as in the data features")
@ -186,51 +228,80 @@ class ArrayDataset(Dataset):
raise ValueError('Non equivalent lengths of x and y')
def get_samples(self) -> "OUTPUT_DATA_ARRAY_TYPE":
    """
    Get the data samples.

    :return: data samples as a numpy array
    """
    # Removed the duplicate one-line docstring left over from the old version
    return self._x
def get_labels(self) -> "OUTPUT_DATA_ARRAY_TYPE":
    """
    Get the labels.

    :return: labels as a numpy array
    """
    # Removed the duplicate one-line docstring left over from the old version
    return self._y
class PytorchData(Dataset):
    """
    Dataset for pytorch models.

    :param x: collection of data samples
    :type x: numpy array or pandas DataFrame or list or pytorch Tensor
    :param y: collection of labels
    :type y: numpy array or pandas DataFrame or list or pytorch Tensor, optional
    """

    def __init__(self, x: "INPUT_DATA_ARRAY_TYPE", y: "Optional[INPUT_DATA_ARRAY_TYPE]" = None, **kwargs):
        self.is_pandas = False
        # Removed duplicate assignments that still called the deleted
        # module-level array2torch_tensor(); only the method versions remain.
        self._y = self._array2torch_tensor(y) if y is not None else None
        self._x = self._array2torch_tensor(x)
        if self.is_pandas:
            # NOTE(review): is_pandas may also be set by a pandas `y`;
            # x.columns then assumes x is pandas too — confirm with callers.
            self.features_names = x.columns
        if y is not None and len(self._x) != len(self._y):
            raise ValueError('Non equivalent lengths of x and y')
        # NOTE(review): assigning __getitem__ on the instance does not affect
        # `obj[idx]` (dunder lookup happens on the type); only explicit
        # `obj.__getitem__(idx)` calls see it. Kept for backward compatibility.
        if self._y is not None:
            self.__getitem__ = self.get_item
        else:
            self.__getitem__ = self.get_sample_item

    def get_samples(self) -> "OUTPUT_DATA_ARRAY_TYPE":
        """
        Get data samples.

        :return: samples as a numpy array
        """
        return self._array2numpy(self._x)

    def get_labels(self) -> "OUTPUT_DATA_ARRAY_TYPE":
        """
        Get labels.

        :return: labels as a numpy array, or None if no labels were provided
        """
        return self._array2numpy(self._y) if self._y is not None else None

    def get_sample_item(self, idx: int) -> Tensor:
        """
        Get the sample at the given index.

        :param idx: the index of the sample to return
        :type idx: int
        :return: the sample as a pytorch Tensor
        """
        # Bug fix: samples are stored in self._x (set in __init__);
        # self.x was never assigned and raised AttributeError
        return self._x[idx]

    def get_item(self, idx: int) -> tuple:
        """
        Get the sample and label at the given index.

        :param idx: the index of the sample to return
        :type idx: int
        :return: a (sample, label) tuple of pytorch Tensors
        """
        # Bug fix: use the _x/_y attributes set in __init__ (self.x/self.y
        # were never assigned); annotation corrected to reflect the tuple.
        sample, label = self._x[idx], self._y[idx]
        return sample, label
@ -246,8 +317,10 @@ class DatasetFactory:
def register(cls, name: str) -> Callable:
"""
Class method to register Dataset to the internal registry
:param name: dataset name
:return:
:type name: string
:return: a Callable that returns the registered dataset class
"""
def inner_wrapper(wrapped_class: Dataset) -> Any:
@ -262,11 +335,15 @@ class DatasetFactory:
def create_dataset(cls, name: str, **kwargs) -> Dataset:
"""
Factory command to create dataset instance.
This method gets the appropriate Dataset class from the registry
and creates an instance of it, while passing in the parameters
given in ``kwargs``.
:param name: The name of the dataset to create.
:type name: string
:param kwargs: dataset parameters
:type kwargs: keyword arguments as expected by the class
:return: An instance of the dataset that is created.
"""
if name not in cls.registry:
@ -280,13 +357,19 @@ class DatasetFactory:
class Data:
def __init__(self, train: Dataset = None, test: Dataset = None, **kwargs):
"""
Class for storing train and test datasets.
:param train: the training set
:type train: `Dataset`
:param test: the test set
:type test: `Dataset`, optional
"""
def __init__(self, train: Dataset = None, test: Optional[Dataset] = None, **kwargs):
"""
Data class constructor.
The class stores train and test datasets.
If neither of the datasets was provided,
Both train and test datasets will be create using
DatasetFactory to create a dataset instance
If neither of the datasets was provided, both train and test datasets will be created using `DatasetFactory`.
"""
if train or test:
self.train = train
@ -296,25 +379,49 @@ class Data:
self.test = DatasetFactory.create_dataset(train=False, **kwargs)
def get_train_set(self) -> "Dataset":
    """
    Get the training set.

    :return: the training `Dataset`
    """
    # Removed the duplicate one-line docstring; fixed the 'Dataset` quote typo
    return self.train
def get_test_set(self) -> "Dataset":
    """
    Get the test set.

    :return: the test `Dataset`
    """
    # Removed the duplicate one-line docstring; fixed the 'Dataset` quote typo
    return self.test
def get_train_samples(self) -> Collection[Any]:
    """
    Get the training set samples.

    :return: the training samples
    """
    # Removed the duplicate one-line docstring left over from the old version
    return self.train.get_samples()
def get_train_labels(self) -> Collection[Any]:
    """
    Get the training set labels.

    :return: the training labels
    """
    # Removed the duplicate one-line docstring left over from the old version
    return self.train.get_labels()
def get_test_samples(self) -> Collection[Any]:
    """
    Get the test set samples.

    :return: the test samples
    """
    # Removed the duplicate one-line docstring left over from the old version
    return self.test.get_samples()
def get_test_labels(self) -> Collection[Any]:
    """
    Get the test set labels.

    :return: the test labels
    """
    # Removed the duplicate one-line docstring left over from the old version
    return self.test.get_labels()