Documentation updates (#29)

* Bump version to 0.1.0 (breaking changes to some APIs) * Update documentation * Update requirements * gitignore
2026-06-29 15:59:38 +02:00 · 2022-05-02 11:46:18 +03:00 · 2022-05-02 11:46:18 +03:00 · fd6be8e778
commit fd6be8e778
parent 014aed9670
12 changed files with 640 additions and 298 deletions
--- a/apt/anonymization/anonymizer.py
+++ b/apt/anonymization/anonymizer.py
@ -19,27 +19,25 @@ class Anonymize:

    Based on the implementation described in: https://arxiv.org/abs/2007.13086

-    Parameters
-    ----------
-    k : int
-        The privacy parameter that determines the number of records that will be indistinguishable from each
-        other (when looking at the quasi identifiers). Should be at least 2.
-    quasi_identifiers : np.ndarray or list
-        The features that need to be minimized in case of pandas data, and indexes of features
-        in case of numpy data.
-    categorical_features : list, optional
-        The list of categorical features (should only be supplied when passing data as a
-        pandas dataframe.
-    is_regression : Bool, optional
-        Whether the model is a regression model or not (if False, assumes
-        a classification model). Default is False.
-    train_only_QI : Bool, optional
-        The required method to train data set for anonymization. Default is
-        to train the tree on all features.
+    :param k: The privacy parameter that determines the number of records that will be indistinguishable from each
+              other (when looking at the quasi identifiers). Should be at least 2.
+    :type k: int
+    :param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
+                              in case of numpy data.
+    :type quasi_identifiers: np.ndarray or list
+    :param categorical_features: The list of categorical features (if supplied, these features will be one-hot encoded
+                                 before using them to train the decision tree model).
+    :type categorical_features: list, optional
+    :param is_regression: Whether the model is a regression model or not (if False, assumes a classification model).
+                          Default is False.
+    :type is_regression: bool, optional
+    :param train_only_QI: The required method to train data set for anonymization. Default is
+                          to train the tree on all features.
+    :type train_only_QI: boolean, optional
    """

    def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
-                 is_regression=False, train_only_QI=False):
+                 is_regression: Optional[bool] = False, train_only_QI: Optional[bool] = False):
        if k < 2:
            raise ValueError("k should be a positive integer with a value of 2 or higher")
        if quasi_identifiers is None or len(quasi_identifiers) < 1:
@ -59,7 +57,9 @@ class Anonymize:

        :param dataset: Data wrapper containing the training data for the model and the predictions of the
                        original model on the training data.
-        :return: An array containing the anonymized training dataset.
+        :type dataset: `ArrayDataset`
+        :return: The anonymized training dataset as either numpy array or pandas DataFrame (depending on the type of
+                 the original data used to create the ArrayDataset).
        """
        if dataset.get_samples().shape[1] != 0:
            self.features = [i for i in range(dataset.get_samples().shape[1])]