Update docs

This commit is contained in:
abigailt 2022-02-23 19:40:11 +02:00
parent 7e2ce7fe96
commit c47819a031
3 changed files with 21 additions and 13 deletions

View file

@ -14,19 +14,25 @@ class Anonymize:
Class for performing tailored, model-guided anonymization of training datasets for ML models.
Based on the implementation described in: https://arxiv.org/abs/2007.13086
Parameters
----------
k : int
The privacy parameter that determines the number of records that will be indistinguishable from each
other (when looking at the quasi identifiers). Should be at least 2.
quasi_identifiers : np.ndarray or list
The features that need to be minimized in case of pandas data, and indexes of features
in case of numpy data.
categorical_features : list, optional
The list of categorical features (should only be supplied when passing data as a
pandas dataframe.
is_regression : Bool, optional
Whether the model is a regression model or not (if False, assumes
a classification model). Default is False.
"""
def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
is_regression=False):
"""
:param k: The privacy parameter that determines the number of records that will be indistinguishable from each
other (when looking at the quasi identifiers). Should be at least 2.
:param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
in case of numpy data.
:param categorical_features: The list of categorical features (should only be supplied when passing data as a
pandas dataframe.
:param is_regression: Boolean param indicates that is is a regression problem.
"""
if k < 2:
raise ValueError("k should be a positive integer with a value of 2 or higher")
if quasi_identifiers is None or len(quasi_identifiers) < 1:

View file

@ -39,12 +39,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
estimator : estimator, optional
The original model for which generalization is being performed.
Should be pre-fitted.
target_accuracy : float, optional
The required accuracy when applying the base model to the
generalized data. Accuracy is measured relative to the original
accuracy of the model.
features : list of str, optional
The feature names, in the order that they appear in the data.
categorical_features: list of str, optional
@ -63,6 +61,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
The required method to train data set for minimizing. Default is
to train the tree just on the features that are given as
features_to_minimize.
is_regression : Bool, optional
Whether the model is a regression model or not (if False, assumes
a classification model). Default is False.
Attributes
----------

View file

@ -19,20 +19,21 @@ def _load_iris(test_set_size: float = 0.3):
return (x_train, y_train), (x_test, y_test)
def get_iris_dataset():
def get_iris_dataset(test_set: float = 0.3):
"""
Loads the Iris dataset from scikit-learn.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1).
:return: Entire dataset and labels as numpy array.
"""
return _load_iris()
return _load_iris(test_set)
def get_german_credit_dataset(test_set: float = 0.3):
"""
Loads the UCI German_credit dataset from `tests/datasets/german` or downloads it if necessary.
:param test_set: Proportion of the data to use as validation split (value between 0 and 1).
:return: Dataset and labels as pandas dataframes.
"""