mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-06-29 15:59:38 +02:00
Documentation updates (#29)
* Bump version to 0.1.0 (breaking changes to some APIs) * Update documentation * Update requirements * gitignore
This commit is contained in:
parent
014aed9670
commit
fd6be8e778
12 changed files with 640 additions and 298 deletions
|
|
@ -19,27 +19,25 @@ class Anonymize:
|
|||
|
||||
Based on the implementation described in: https://arxiv.org/abs/2007.13086
|
||||
|
||||
Parameters
|
||||
----------
|
||||
k : int
|
||||
The privacy parameter that determines the number of records that will be indistinguishable from each
|
||||
other (when looking at the quasi identifiers). Should be at least 2.
|
||||
quasi_identifiers : np.ndarray or list
|
||||
The features that need to be minimized in case of pandas data, and indexes of features
|
||||
in case of numpy data.
|
||||
categorical_features : list, optional
|
||||
The list of categorical features (should only be supplied when passing data as a
|
||||
pandas dataframe.
|
||||
is_regression : Bool, optional
|
||||
Whether the model is a regression model or not (if False, assumes
|
||||
a classification model). Default is False.
|
||||
train_only_QI : Bool, optional
|
||||
The required method to train data set for anonymization. Default is
|
||||
to train the tree on all features.
|
||||
:param k: The privacy parameter that determines the number of records that will be indistinguishable from each
|
||||
other (when looking at the quasi identifiers). Should be at least 2.
|
||||
:type k: int
|
||||
:param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
|
||||
in case of numpy data.
|
||||
:type quasi_identifiers: np.ndarray or list
|
||||
:param categorical_features: The list of categorical features (if supplied, these featurtes will be one-hot encoded
|
||||
before using them to train the decision tree model).
|
||||
:type categorical_features: list, optional
|
||||
:param is_regression: Whether the model is a regression model or not (if False, assumes a classification model).
|
||||
Default is False.
|
||||
:type is_regression: list, optional
|
||||
:param train_only_QI: The required method to train data set for anonymization. Default is
|
||||
to train the tree on all features.
|
||||
:type train_only_QI: boolean, optional
|
||||
"""
|
||||
|
||||
def __init__(self, k: int, quasi_identifiers: Union[np.ndarray, list], categorical_features: Optional[list] = None,
|
||||
is_regression=False, train_only_QI=False):
|
||||
is_regression: Optional[bool] = False, train_only_QI: Optional[bool] = False):
|
||||
if k < 2:
|
||||
raise ValueError("k should be a positive integer with a value of 2 or higher")
|
||||
if quasi_identifiers is None or len(quasi_identifiers) < 1:
|
||||
|
|
@ -59,7 +57,9 @@ class Anonymize:
|
|||
|
||||
:param dataset: Data wrapper containing the training data for the model and the predictions of the
|
||||
original model on the training data.
|
||||
:return: An array containing the anonymized training dataset.
|
||||
:type dataset: `ArrayDataset`
|
||||
:return: The anonymized training dataset as either numpy array or pandas DataFrame (depending on the type of
|
||||
the original data used to create the ArrayDataset).
|
||||
"""
|
||||
if dataset.get_samples().shape[1] != 0:
|
||||
self.features = [i for i in range(dataset.get_samples().shape[1])]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue