Unify the interface so that the main method assess_privacy always returns a score; the score may also contain the attack result, which can be analyzed further

Signed-off-by: Maya Anderson <mayaa@il.ibm.com>
2026-07-23 17:01:03 +02:00 · 2023-03-08 12:25:58 +02:00 · 2023-03-08 12:25:58 +02:00 · 3ae64054f8
commit 3ae64054f8
parent 69a9a8fa2b
6 changed files with 42 additions and 54 deletions
--- a/apt/risk/data_assessment/__init__.py
+++ b/apt/risk/data_assessment/__init__.py
@@ -3,7 +3,9 @@ Module providing privacy risk assessment for synthetic data.

 The main interface, ``DatasetAttack``, with the assess_privacy() main method assumes the availability of the
 training data, holdout data and synthetic data at the time of the privacy evaluation.
-It is implemented by two types of abstract classes: ``DatasetAttackPerRecord`` and ``DatasetAttackWhole``, to be
-implemented by concrete assessment methods.
+It is to be implemented by concrete assessment methods, which can run the assessment on a per-record level,
+or on the whole dataset.
+The abstract class ``DatasetAttackPerRecord`` implements the ``DatasetAttack`` interface, but adds the result
+of the attack, so that the final score contains both the result for further analysis and the calculated score.
 """
 from apt.risk.data_assessment import dataset_attack
--- a/apt/risk/data_assessment/dataset_assessment_manager.py
+++ b/apt/risk/data_assessment/dataset_assessment_manager.py
@@ -34,15 +34,15 @@ class DatasetAssessmentManager:
    def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
               synthetic_data: ArrayDataset, dataset_name: str = "dataset") -> (
            DatasetAttackScorePerRecordKnnProbabilities, DatasetAttackScoreWholeDatasetKnnDistance):
-        config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False)
+        config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False,
+                                                                 generate_plot=self.config.generate_plots)
        mgr = DatasetAttackPerRecordKnnProbabilities(original_data_members,
                                                     original_data_non_members,
                                                     synthetic_data,
                                                     dataset_name,
                                                     config_gl)

-        result = mgr.assess_privacy()
-        score_g = mgr.calculate_privacy_score(result, generate_plot=self.config.generate_plots)
+        score_g = mgr.assess_privacy()
        self.attack_scores_per_record_knn_probabilities.append(score_g)

        config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False)
--- a/apt/risk/data_assessment/dataset_attack.py
+++ b/apt/risk/data_assessment/dataset_attack.py
@@ -2,7 +2,7 @@
 This module defines the interface for privacy risk assessment of synthetic datasets.
 """
 import abc
-from typing import Optional, Union
+from typing import Optional

 import matplotlib.pyplot as plt
 import numpy as np
@@ -10,8 +10,7 @@ from sklearn import metrics
 from sklearn.metrics import RocCurveDisplay

 from apt.risk.data_assessment.attack_strategy_utils import AttackStrategyUtils
-from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord, \
-    DatasetAttackResult
+from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord
 from apt.utils.datasets import ArrayDataset


@@ -52,12 +51,11 @@ class DatasetAttack(abc.ABC):
        self.config = config

    @abc.abstractmethod
-    def assess_privacy(self) -> Union[DatasetAttackScore, DatasetAttackResult]:
+    def assess_privacy(self) -> DatasetAttackScore:
        """
        Assess the privacy of the dataset
        :return:
-            result: Union[DatasetAttackScore, DatasetAttackResult] can be either the final privacy attack score,
-            or an intermediate attack result, which can be translated into a privacy score if needed
+            score: DatasetAttackScore the privacy attack score
        """
        pass

@@ -67,22 +65,13 @@ class DatasetAttackPerRecord(DatasetAttack):
         An abstract base class for performing privacy risk assessment for synthetic datasets on a per-record level.
    """

-    @abc.abstractmethod
-    def assess_privacy(self) -> DatasetAttackResultPerRecord:
-        """
-        Assess the privacy of the dataset
-        :return:
-            result: DatasetAttackResultPerRecord
-        """
-        pass
-
    @abc.abstractmethod
    def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultPerRecord,
                                generate_plot=False) -> DatasetAttackScore:
        """
-        Calculate dataset privacy score based on the result of the privacy assessment
+        Calculate dataset privacy score based on the result of the privacy attack
        :return:
-            result: DatasetAttackScore
+            score: DatasetAttackScore
        """
        pass

@@ -120,18 +109,3 @@ class DatasetAttackPerRecord(DatasetAttack):
        auc = metrics.roc_auc_score(labels, results)
        ap = metrics.average_precision_score(labels, results)
        return fpr, tpr, threshold, auc, ap
-
-
-class DatasetAttackWhole(DatasetAttack):
-    """
-         An abstract base class for performing privacy risk assessment for synthetic datasets on a whole-dataset level.
-    """
-
-    @abc.abstractmethod
-    def assess_privacy(self) -> DatasetAttackScore:
-        """
-        Assess the privacy of the dataset
-        :return:
-            result: DatasetAttackScore
-        """
-        pass
--- a/apt/risk/data_assessment/dataset_attack_result.py
+++ b/apt/risk/data_assessment/dataset_attack_result.py
@@ -1,19 +1,24 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field

 import numpy as np


+@dataclass
+class DatasetAttackScore:
+    dataset_name: str
+
+
 @dataclass
 class DatasetAttackResult:
    dataset_name: str


 @dataclass
-class DatasetAttackResultPerRecord(DatasetAttackResult):
-    positive_probabilities: np.ndarray
-    negative_probabilities: np.ndarray
+class DatasetAttackScoreWithResult(DatasetAttackScore):
+    result: DatasetAttackResult = field(repr=False)


 @dataclass
-class DatasetAttackScore:
-    dataset_name: str
+class DatasetAttackResultPerRecord(DatasetAttackResult):
+    positive_probabilities: np.ndarray
+    negative_probabilities: np.ndarray
--- a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py
+++ b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py
@@ -12,7 +12,7 @@ import numpy as np
 from sklearn.neighbors import NearestNeighbors

 from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils
-from apt.risk.data_assessment.dataset_attack import DatasetAttackWhole, Config
+from apt.risk.data_assessment.dataset_attack import Config, DatasetAttack
 from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore
 from apt.utils.datasets import ArrayDataset

@@ -51,7 +51,7 @@ class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore):
    assessment_type: str = 'WholeDatasetKnnDistance'


-class DatasetAttackWholeDatasetKnnDistance(DatasetAttackWhole):
+class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
    """
         Privacy risk assessment for synthetic datasets based on distances of synthetic data records from
         members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic
@@ -89,7 +89,7 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttackWhole):
        Calculate the share of synthetic records closer to the training than the holdout dataset, based on the
        DCR computed by 'calculate_distances()'.
        :return:
-            :result of the attack, based on the NN distances from the query samples to the synthetic data samples
+            :score of the attack, based on the NN distances from the query samples to the synthetic data samples
        """
        member_distances, non_member_distances = self.calculate_distances()
        # distance of the synth. records to members and to non-members
--- a/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py
+++ b/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py
@@ -12,7 +12,8 @@ from sklearn.neighbors import NearestNeighbors

 from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils
 from apt.risk.data_assessment.dataset_attack import DatasetAttackPerRecord, Config
-from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord
+from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord, \
+    DatasetAttackScoreWithResult
 from apt.utils.datasets import ArrayDataset


@@ -29,16 +30,18 @@ class DatasetAttackConfigPerRecordKnnProbabilities(Config):
            See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation.
        distance_params:  Additional keyword arguments for the distance computation function, see 'metric_params' in
            sklearn.neighbors.NearestNeighbors documentation.
+        generate_plot: Whether to generate an AUC ROC curve and persist it in a file
    """
    k: int = 5
    use_batches: bool = False
    batch_size: int = 10
    compute_distance: Callable = None
    distance_params: dict = None
+    generate_plot: bool = False


 @dataclass
-class DatasetAttackScorePerRecordKnnProbabilities(DatasetAttackScore):
+class DatasetAttackScorePerRecordKnnProbabilities(DatasetAttackScoreWithResult):
    """DatasetAttackPerRecordKnnProbabilities privacy score.
    Attributes
    ----------
@@ -81,7 +84,7 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
        else:
            self.knn_learner = NearestNeighbors(n_neighbors=config.k, algorithm='auto')

-    def assess_privacy(self) -> DatasetAttackResultPerRecord:
+    def assess_privacy(self) -> DatasetAttackScorePerRecordKnnProbabilities:
        """
        Membership Inference Attack which calculates probabilities of positive and negative samples to be generated by
        the synthetic data generator.
@@ -94,8 +97,9 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
        query samples to the synthetic data samples.

        :return
-            :result of the attack with the probabilities of positive and negative samples to be generated by the
-                synthetic data generator based on the NN distances from the query samples to the synthetic data samples
+            :score Privacy score of the attack together with the attack result with the probabilities of positive and
+                negative samples to be generated by the synthetic data generator based on the NN distances from the
+                query samples to the synthetic data samples
        """
        # nearest neighbor search
        self.attack_strategy_utils.fit(self.knn_learner, self.synthetic_data)
@@ -110,7 +114,9 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):

        result = DatasetAttackResultPerRecord(self.dataset_name, positive_probabilities=pos_proba,
                                              negative_probabilities=neg_proba)
-        return result
+
+        score = self.calculate_privacy_score(result, self.config.generate_plot)
+        return score

    def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultPerRecord,
                                generate_plot=False) -> DatasetAttackScore:
@@ -126,8 +132,9 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
        pos_proba, neg_proba = \
            dataset_attack_result.positive_probabilities, dataset_attack_result.negative_probabilities
        fpr, tpr, threshold, auc, ap = self.calculate_metrics(pos_proba, neg_proba)
-        score = DatasetAttackScorePerRecordKnnProbabilities(self.dataset_name, roc_auc_score=auc,
-                                                            average_precision_score=ap)
+        score = DatasetAttackScorePerRecordKnnProbabilities(self.dataset_name,
+                                                            result=dataset_attack_result,
+                                                            roc_auc_score=auc, average_precision_score=ap)
        if generate_plot:
            self.plot_roc_curve(pos_proba, neg_proba)
        return score