Add column distribution comparison, and a third method for dataset assessment by membership classification (#84)

* Add column distribution comparison, and a third method for dataset assessment by membership classification * Address review comments, add additional distribution comparison tests and make them externally configurable too, in addition to the alpha becoming configurable. Signed-off-by: Maya Anderson <mayaa@il.ibm.com>
2026-04-26 05:16:22 +02:00 · 2023-09-21 16:43:19 +03:00 · 2023-09-21 16:43:19 +03:00 · a40484e0c9
commit a40484e0c9
parent 13a0567183
8 changed files with 676 additions and 205 deletions
--- a/apt/risk/data_assessment/dataset_attack.py
+++ b/apt/risk/data_assessment/dataset_attack.py
@ -16,59 +16,68 @@ from apt.utils.datasets import ArrayDataset

 class Config(abc.ABC):
    """
-    The base class for dataset attack configurations
+        The base class for dataset attack configurations
    """
    pass


 class DatasetAttack(abc.ABC):
    """
-     The interface for performing privacy attack for risk assessment of synthetic datasets to be used in AI model
-     training. The original data members (training data) and non-members (the holdout data) should be available.
-     For reliability, all the datasets should be preprocessed and normalized.
-
-     :param original_data_members: A container for the training original samples and labels,
-            only samples are used in the assessment
-     :param original_data_non_members: A container for the holdout original samples and labels,
-            only samples are used in the assessment
-     :param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment
-     :param config: Configuration parameters to guide the assessment process
-     :param dataset_name: A name to identify the dataset under attack, optional
-     :param attack_strategy_utils: Utils for use with the attack strategy, optional
+         The interface for performing privacy attack for risk assessment of synthetic datasets to be used in AI model
+         training. The original data members (training data) and non-members (the holdout data) should be available.
+         For reliability, all the datasets should be preprocessed and normalized.
    """

    def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
                 synthetic_data: ArrayDataset, config: Config, dataset_name: str,
+                 categorical_features: list = [],
                 attack_strategy_utils: Optional[AttackStrategyUtils] = None) -> None:
+        """
+        :param original_data_members: A container for the training original samples and labels,
+            only samples are used in the assessment
+        :param original_data_non_members: A container for the holdout original samples and labels,
+            only samples are used in the assessment
+        :param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment
+        :param config: Configuration parameters to guide the assessment process
+        :param dataset_name: A name to identify the dataset under attack, optional
+        :param categorical_features: The list of categorical features (column names for pandas and column indexes for
+        numpy), optional
+        :param attack_strategy_utils: Utils for use with the attack strategy, optional
+        """
+
        self.original_data_members = original_data_members
        self.original_data_non_members = original_data_non_members
        self.synthetic_data = synthetic_data
        self.config = config
-        self.attack_strategy_utils = attack_strategy_utils
        self.dataset_name = dataset_name
+        self.categorical_features = categorical_features
+        self.attack_strategy_utils = attack_strategy_utils

    @abc.abstractmethod
    def assess_privacy(self) -> DatasetAttackScore:
        """
-        Assess the privacy of the dataset.
-
+        Assess the privacy of the dataset
        :return:
            score: DatasetAttackScore the privacy attack risk score
        """
        pass

+    @property
+    @abc.abstractmethod
+    def short_name(self):
+        pass
+

 class DatasetAttackMembership(DatasetAttack):
    """
-    An abstract base class for performing privacy risk assessment for synthetic datasets on a per-record level.
+         An abstract base class for performing privacy risk assessment for synthetic datasets on a per-record level.
    """

    @abc.abstractmethod
    def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultMembership,
                                generate_plot: bool = False) -> DatasetAttackScore:
        """
-        Calculate dataset privacy score based on the result of the privacy attack.
-
+        Calculate dataset privacy score based on the result of the privacy attack
        :return:
            score: DatasetAttackScore
        """
@ -78,12 +87,11 @@ class DatasetAttackMembership(DatasetAttack):
    def plot_roc_curve(dataset_name: str, member_probabilities: np.ndarray, non_member_probabilities: np.ndarray,
                       filename_prefix: str = ""):
        """
-        Plot ROC curve.
-
-        :param dataset_name: dataset name, will become part of the plot filename.
-        :param member_probabilities: probability estimates of the member samples, the training data.
-        :param non_member_probabilities: probability estimates of the non-member samples, the hold-out data.
-        :param filename_prefix: name prefix for the ROC curve plot.
+        Plot ROC curve
+        :param dataset_name: dataset name, will become part of the plot filename
+        :param member_probabilities: probability estimates of the member samples, the training data
+        :param non_member_probabilities: probability estimates of the non-member samples, the hold-out data
+        :param filename_prefix: name prefix for the ROC curve plot
        """
        labels = np.concatenate((np.zeros((len(non_member_probabilities),)), np.ones((len(member_probabilities),))))
        results = np.concatenate((non_member_probabilities, member_probabilities))
@ -95,10 +103,9 @@ class DatasetAttackMembership(DatasetAttack):
    @staticmethod
    def calculate_metrics(member_probabilities: np.ndarray, non_member_probabilities: np.ndarray):
        """
-        Calculate attack performance metrics.
-
-        :param member_probabilities: probability estimates of the member samples, the training data.
-        :param non_member_probabilities: probability estimates of the non-member samples, the hold-out data.
+        Calculate attack performance metrics
+        :param member_probabilities: probability estimates of the member samples, the training data
+        :param non_member_probabilities: probability estimates of the non-member samples, the hold-out data
        :return:
            fpr: False Positive rate
            tpr: True Positive rate