Documentation updates (#29)

* Bump version to 0.1.0 (breaking changes to some APIs)

* Update documentation

* Update requirements

* gitignore
This commit is contained in:
abigailgold 2022-05-02 11:46:18 +03:00 committed by GitHub
parent 014aed9670
commit fd6be8e778
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 640 additions and 298 deletions

View file

@ -19,27 +19,25 @@ class Anonymize:
Based on the implementation described in: https://arxiv.org/abs/2007.13086
Parameters
----------
k : int
The privacy parameter that determines the number of records that will be indistinguishable from each
other (when looking at the quasi identifiers). Should be at least 2.
quasi_identifiers : np.ndarray or list
The features that need to be minimized in case of pandas data, and indexes of features
in case of numpy data.
categorical_features : list, optional
The list of categorical features (should only be supplied when passing data as a
pandas dataframe).
is_regression : Bool, optional
Whether the model is a regression model or not (if False, assumes
a classification model). Default is False.
train_only_QI : Bool, optional
The required method to train data set for anonymization. Default is
to train the tree on all features.
:param k: The privacy parameter that determines the number of records that will be indistinguishable from each
other (when looking at the quasi identifiers). Should be at least 2.
:type k: int
:param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
in case of numpy data.
:type quasi_identifiers: np.ndarray or list
:param categorical_features: The list of categorical features (if supplied, these features will be one-hot encoded
before using them to train the decision tree model).
:type categorical_features: list, optional
:param is_regression: Whether the model is a regression model or not (if False, assumes a classification model).
Default is False.
:type is_regression: boolean, optional
:param train_only_QI: Whether to train the tree used for anonymization only on the quasi-identifier features.
Default is False, i.e. to train the tree on all features.
:type train_only_QI: boolean, optional
"""
def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
is_regression=False, train_only_QI=False):
is_regression: Optional[bool] = False, train_only_QI: Optional[bool] = False):
if k < 2:
raise ValueError("k should be a positive integer with a value of 2 or higher")
if quasi_identifiers is None or len(quasi_identifiers) < 1:
@ -59,7 +57,9 @@ class Anonymize:
:param dataset: Data wrapper containing the training data for the model and the predictions of the
original model on the training data.
:return: An array containing the anonymized training dataset.
:type dataset: `ArrayDataset`
:return: The anonymized training dataset as either numpy array or pandas DataFrame (depending on the type of
the original data used to create the ArrayDataset).
"""
if dataset.get_samples().shape[1] != 0:
self.features = [i for i in range(dataset.get_samples().shape[1])]