Update docs

2026-06-08 15:05:13 +02:00 · 2022-02-23 19:40:11 +02:00 · 2022-02-23 19:40:11 +02:00 · c47819a031
commit c47819a031
parent 7e2ce7fe96
3 changed files with 21 additions and 13 deletions
--- a/apt/anonymization/anonymizer.py
+++ b/apt/anonymization/anonymizer.py
@@ -14,19 +14,25 @@ class Anonymize:
    Class for performing tailored, model-guided anonymization of training datasets for ML models.

    Based on the implementation described in: https://arxiv.org/abs/2007.13086
+
+    Parameters
+    ----------
+    k : int
+        The privacy parameter that determines the number of records that will be indistinguishable from each
+        other (when looking at the quasi identifiers). Should be at least 2.
+    quasi_identifiers : np.ndarray or list
+        The features that need to be minimized: feature names in case of pandas data, and indexes of
+        features in case of numpy data.
+    categorical_features : list, optional
+        The list of categorical features (should only be supplied when passing data as a
+        pandas dataframe).
+    is_regression : bool, optional
+        Whether the model is a regression model or not (if False, assumes
+        a classification model). Default is False.
    """

    def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
                 is_regression=False):
-        """
-        :param k: The privacy parameter that determines the number of records that will be indistinguishable from each
-                  other (when looking at the quasi identifiers). Should be at least 2.
-        :param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
-                                  in case of numpy data.
-        :param categorical_features: The list of categorical features (should only be supplied when passing data as a
-                                     pandas dataframe.
-        :param is_regression: Boolean param indicates that is is a regression problem.
-        """
        if k < 2:
            raise ValueError("k should be a positive integer with a value of 2 or higher")
        if quasi_identifiers is None or len(quasi_identifiers) < 1:
--- a/apt/minimization/minimizer.py
+++ b/apt/minimization/minimizer.py
@@ -39,12 +39,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
    estimator : estimator, optional
        The original model for which generalization is being performed.
        Should be pre-fitted.
-
    target_accuracy : float, optional
        The required accuracy when applying the base model to the
        generalized data. Accuracy is measured relative to the original
        accuracy of the model.
-
    features : list of str, optional
        The feature names, in the order that they appear in the data.
    categorical_features: list of str, optional
@@ -63,6 +61,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        The required method to train data set for minimizing. Default is
        to train the tree just on the features that are given as
        features_to_minimize.
+    is_regression : bool, optional
+        Whether the model is a regression model or not (if False, assumes
+        a classification model). Default is False.

    Attributes
    ----------
--- a/apt/utils.py
+++ b/apt/utils.py
@@ -19,20 +19,21 @@ def _load_iris(test_set_size: float = 0.3):
    return (x_train, y_train), (x_test, y_test)


-def get_iris_dataset():
+def get_iris_dataset(test_set: float = 0.3):
    """
    Loads the Iris dataset from scikit-learn.

    :param test_set: Proportion of the data to use as validation split (value between 0 and 1).
    :return: Entire dataset and labels as numpy array.
    """
-    return _load_iris()
+    return _load_iris(test_set)


 def get_german_credit_dataset(test_set: float = 0.3):
    """
    Loads the UCI German_credit dataset from `tests/datasets/german` or downloads it if necessary.

+    :param test_set: Proportion of the data to use as validation split (value between 0 and 1).
    :return: Dataset and labels as pandas dataframes.
    """