Address review comments - make dataset_name optional, fix filename. Fix score serialization - don't serialize the result.

Signed-off-by: Maya Anderson <mayaa@il.ibm.com>
2026-07-23 17:01:03 +02:00 · 2023-03-09 22:38:39 +02:00 · 2023-03-09 22:38:39 +02:00 · a122976807
commit a122976807
parent 3ae64054f8
6 changed files with 73 additions and 58 deletions
--- a/apt/risk/data_assessment/dataset_assessment_manager.py
+++ b/apt/risk/data_assessment/dataset_assessment_manager.py
@ -1,12 +1,15 @@
-from dataclasses import dataclass
+from __future__ import annotations
 from typing import Optional
+from dataclasses import dataclass
+

 import pandas as pd

+from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DEFAULT_DATASET_NAME
 from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import \
    DatasetAttackConfigWholeDatasetKnnDistance, DatasetAttackWholeDatasetKnnDistance, \
    DatasetAttackScoreWholeDatasetKnnDistance
-from apt.risk.data_assessment.per_record_knn_probabilities_dataset_attack_ import \
+from apt.risk.data_assessment.dataset_attack_per_record_knn_probabilities import \
    DatasetAttackConfigPerRecordKnnProbabilities, DatasetAttackPerRecordKnnProbabilities, \
    DatasetAttackScorePerRecordKnnProbabilities
 from apt.utils.datasets import ArrayDataset
@ -22,8 +25,8 @@ class DatasetAssessmentManager:
    """
    The main class for running dataset assessment attacks.
    """
-    attack_scores_per_record_knn_probabilities = []
-    attack_scores_whole_dataset_knn_distance = []
+    attack_scores_per_record_knn_probabilities: list[DatasetAttackScore] = []
+    attack_scores_whole_dataset_knn_distance: list[DatasetAttackScore] = []

    def __init__(self, config: Optional[DatasetAssessmentManagerConfig] = DatasetAssessmentManagerConfig) -> None:
        """
@ -32,27 +35,39 @@ class DatasetAssessmentManager:
        self.config = config

    def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
-               synthetic_data: ArrayDataset, dataset_name: str = "dataset") -> (
-            DatasetAttackScorePerRecordKnnProbabilities, DatasetAttackScoreWholeDatasetKnnDistance):
+               synthetic_data: ArrayDataset, dataset_name: str = DEFAULT_DATASET_NAME) -> list[DatasetAttackScore]:
+        """
+        Do dataset assessment by running dataset attacks, and return their scores.
+
+        :param original_data_members: A container for the training original samples and labels,
+            only samples are used in the assessment
+        :param original_data_non_members: A container for the holdout original samples and labels,
+            only samples are used in the assessment
+        :param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment
+        :param dataset_name: A name to identify this dataset, optional
+
+        :return:
+            a list of dataset attack scores
+        """
        config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False,
                                                                 generate_plot=self.config.generate_plots)
        mgr = DatasetAttackPerRecordKnnProbabilities(original_data_members,
                                                     original_data_non_members,
                                                     synthetic_data,
-                                                     dataset_name,
-                                                     config_gl)
+                                                     config_gl,
+                                                     dataset_name)

        score_g = mgr.assess_privacy()
        self.attack_scores_per_record_knn_probabilities.append(score_g)

        config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False)
        mgr_h = DatasetAttackWholeDatasetKnnDistance(original_data_members, original_data_non_members, synthetic_data,
-                                                     dataset_name,
-                                                     config_h)
+                                                     config_h,
+                                                     dataset_name)

        score_h = mgr_h.assess_privacy()
        self.attack_scores_whole_dataset_knn_distance.append(score_h)
-        return score_g, score_h
+        return [score_g, score_h]

    def dump_all_scores_to_files(self):
        if self.config.persist_reports:
@ -63,6 +78,6 @@ class DatasetAssessmentManager:
                                     "whole_dataset_knn_distance" + results_log_file, True)

    @staticmethod
-    def dump_scores_to_file(attack_scores, filename, header: bool):
-        run_results_df = pd.DataFrame(attack_scores)
+    def dump_scores_to_file(attack_scores: list[DatasetAttackScore], filename: str, header: bool):
+        run_results_df = pd.DataFrame(attack_scores).drop('result', axis=1, errors='ignore')  # don't serialize result
        run_results_df.to_csv(filename, header=header, encoding='utf-8', index=False, mode='w')  # Overwrite
--- a/apt/risk/data_assessment/dataset_attack.py
+++ b/apt/risk/data_assessment/dataset_attack.py
@ -29,26 +29,26 @@ class DatasetAttack(abc.ABC):
    """

    def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
-                 synthetic_data: ArrayDataset, dataset_name: str, attack_strategy_utils: AttackStrategyUtils,
-                 config: Optional[Config] = Config()) -> None:
+                 synthetic_data: ArrayDataset, config: Config, dataset_name: str,
+                 attack_strategy_utils: Optional[AttackStrategyUtils] = None) -> None:
        """
        :param original_data_members: A container for the training original samples and labels,
            only samples are used in the assessment
        :param original_data_non_members: A container for the holdout original samples and labels,
            only samples are used in the assessment
        :param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment
-        :param dataset_name: A name to identify the dataset under attack
-        :param attack_strategy_utils: Utils for use with the attack strategy
        :param config: Configuration parameters to guide the assessment process such as which attack
-               frameworks to use, optional
+               frameworks to use
+        :param dataset_name: A name to identify the dataset under attack
+        :param attack_strategy_utils: Utils for use with the attack strategy, optional
        """

        self.original_data_members = original_data_members
        self.original_data_non_members = original_data_non_members
        self.synthetic_data = synthetic_data
-        self.dataset_name = dataset_name
-        self.attack_strategy_utils = attack_strategy_utils
        self.config = config
+        self.attack_strategy_utils = attack_strategy_utils
+        self.dataset_name = dataset_name

    @abc.abstractmethod
    def assess_privacy(self) -> DatasetAttackScore:
--- a/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py
+++ b/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py
@ -5,7 +5,7 @@ published in Proceedings of the 2020 ACM SIGSAC Conference on Computer and Commu
 https://doi.org/10.1145/3372297.3417238 and its implementation in https://github.com/DingfanChen/GAN-Leaks.
 """
 from dataclasses import dataclass
-from typing import Optional, Callable
+from typing import Callable

 import numpy as np
 from sklearn.neighbors import NearestNeighbors
@ -13,7 +13,7 @@ from sklearn.neighbors import NearestNeighbors
 from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils
 from apt.risk.data_assessment.dataset_attack import DatasetAttackPerRecord, Config
 from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord, \
-    DatasetAttackScoreWithResult
+    DatasetAttackScoreWithResult, DEFAULT_DATASET_NAME
 from apt.utils.datasets import ArrayDataset


@ -65,19 +65,19 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
    """

    def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
-                 synthetic_data: ArrayDataset, dataset_name: str,
-                 config: Optional[
-                     DatasetAttackConfigPerRecordKnnProbabilities] = DatasetAttackConfigPerRecordKnnProbabilities()):
+                 synthetic_data: ArrayDataset,
+                 config: DatasetAttackConfigPerRecordKnnProbabilities = DatasetAttackConfigPerRecordKnnProbabilities(),
+                 dataset_name: str = DEFAULT_DATASET_NAME):
        """
        :param original_data_members: A container for the training original samples and labels
        :param original_data_non_members: A container for the holdout original samples and labels
        :param synthetic_data: A container for the synthetic samples and labels
-        :param dataset_name: A name to identify this dataset
        :param config: Configuration parameters to guide the attack, optional
+        :param dataset_name: A name to identify this dataset, optional
        """
        attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size)
-        super().__init__(original_data_members, original_data_non_members, synthetic_data, dataset_name,
-                         attack_strategy_utils, config)
+        super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name,
+                         attack_strategy_utils)
        if config.compute_distance:
            self.knn_learner = NearestNeighbors(n_neighbors=config.k, algorithm='auto', metric=config.compute_distance,
                                                metric_params=config.distance_params)
@ -91,15 +91,15 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
        The assumption is that since the generative model is trained to approximate the training data distribution
        then the probability of a sample to be a member of the training data should be proportional to the probability
        that the query sample can be generated by the generative model.
-        The assumption is that if the probability that the query sample is generated by the generative model is large,
+        So, if the probability that the query sample is generated by the generative model is large,
        it is more likely that the query sample was used to train the generative model. This probability is approximated
        by the Parzen window density estimation in 'probability_per_sample()', computed from the NN distances from the
        query samples to the synthetic data samples.

-        :return
-            :score Privacy score of the attack together with the attack result with the probabilities of positive and
-                negative samples to be generated by the synthetic data generator based on the NN distances from the
-                query samples to the synthetic data samples
+        :return:
+            Privacy score of the attack together with the attack result with the probabilities of positive and
+            negative samples to be generated by the synthetic data generator based on the NN distances from the
+            query samples to the synthetic data samples
        """
        # nearest neighbor search
        self.attack_strategy_utils.fit(self.knn_learner, self.synthetic_data)
@ -112,7 +112,7 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
        neg_proba = self.attack_strategy_utils.find_knn(self.knn_learner, self.original_data_non_members,
                                                        self.probability_per_sample)

-        result = DatasetAttackResultPerRecord(self.dataset_name, positive_probabilities=pos_proba,
+        result = DatasetAttackResultPerRecord(positive_probabilities=pos_proba,
                                              negative_probabilities=neg_proba)

        score = self.calculate_privacy_score(result, self.config.generate_plot)
@ -126,8 +126,8 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
        :param dataset_attack_result attack result containing probabilities of positive and negative samples to be
                generated by the synthetic data generator
        :param generate_plot generate AUC ROC curve plot and persist it
-        :return
-            :score of the attack, based on distance-based probabilities - mainly the ROC AUC score
+        :return:
+            score of the attack, based on distance-based probabilities - mainly the ROC AUC score
        """
        pos_proba, neg_proba = \
            dataset_attack_result.positive_probabilities, dataset_attack_result.negative_probabilities
@ -145,7 +145,7 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
        For every sample represented by its distance from the query sample to its KNN in synthetic data,
        computes the probability of the synthetic data to be part of the query dataset.
        :param distances: distance between every query sample in batch to its KNNs among synthetic samples
-        :return
-            distances: probability estimates of the query samples being generated and so being part of the synthetic set
+        :return:
+            probability estimates of the query samples being generated, and thus of being part of the synthetic set
        """
        return np.average(np.exp(-distances), axis=1)
--- a/apt/risk/data_assessment/dataset_attack_result.py
+++ b/apt/risk/data_assessment/dataset_attack_result.py
@ -3,6 +3,8 @@ from dataclasses import dataclass, field
 import numpy as np


+DEFAULT_DATASET_NAME = "dataset"
+
@dataclass
 class DatasetAttackScore:
    dataset_name: str
@ -10,10 +12,10 @@ class DatasetAttackScore:

@dataclass
 class DatasetAttackResult:
-    dataset_name: str
+    pass


-@dataclass
+@dataclass(repr=False)
 class DatasetAttackScoreWithResult(DatasetAttackScore):
    result: DatasetAttackResult = field(repr=False)

--- a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py
+++ b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py
@ -6,14 +6,13 @@ and "Holdout-Based Fidelity and Privacy Assessment of Mixed-Type Synthetic Data"
 and on a variation of its reference implementation in https://github.com/mostly-ai/paper-fidelity-accuracy.
 """
 from dataclasses import dataclass
-from typing import Optional

 import numpy as np
 from sklearn.neighbors import NearestNeighbors

 from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils
 from apt.risk.data_assessment.dataset_attack import Config, DatasetAttack
-from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore
+from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DEFAULT_DATASET_NAME
 from apt.utils.datasets import ArrayDataset

 K = 1  # Number of nearest neighbors to search. For DCR we need only the nearest neighbor.
@ -40,7 +39,7 @@ class DatasetAttackConfigWholeDatasetKnnDistance(Config):

@dataclass
 class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore):
-    """Configuration for DatasetAttackWholeDatasetKnnDistance.
+    """DatasetAttackWholeDatasetKnnDistance privacy score.
    Attributes
    ----------
    share : the share of synthetic records closer to the training than the holdout dataset.
@ -61,20 +60,19 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
    """

    def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
-                 synthetic_data: ArrayDataset, dataset_name: str,
-                 config: Optional[
-                     DatasetAttackConfigWholeDatasetKnnDistance] = DatasetAttackConfigWholeDatasetKnnDistance()):
+                 synthetic_data: ArrayDataset,
+                 config: DatasetAttackConfigWholeDatasetKnnDistance = DatasetAttackConfigWholeDatasetKnnDistance(),
+                 dataset_name: str = DEFAULT_DATASET_NAME):
        """
        :param original_data_members: A container for the training original samples and labels
        :param original_data_non_members: A container for the holdout original samples and labels
        :param synthetic_data: A container for the synthetic samples and labels
-        :param dataset_name: A name to identify this dataset
-        :param config: Configuration parameters to guide the assessment process such as which attack
-               frameworks to use, optional
+        :param config: Configuration parameters to guide the assessment process, optional
+        :param dataset_name: A name to identify this dataset, optional
        """
        attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size)
-        super().__init__(original_data_members, original_data_non_members, synthetic_data, dataset_name,
-                         attack_strategy_utils, config)
+        super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name,
+                         attack_strategy_utils)
        if config.compute_distance:
            self.knn_learner_members = NearestNeighbors(n_neighbors=K, metric=config.compute_distance,
                                                        metric_params=config.distance_params)
@ -89,7 +87,7 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
        Calculate the share of synthetic records closer to the training than the holdout dataset, based on the
        DCR computed by 'calculate_distances()'.
        :return:
-            :score of the attack, based on the NN distances from the query samples to the synthetic data samples
+            score of the attack, based on the NN distances from the query samples to the synthetic data samples
        """
        member_distances, non_member_distances = self.calculate_distances()
        # distance of the synth. records to members and to non-members
@ -111,8 +109,8 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
        N. Park et. al. in "Data Synthesis based on Generative Adversarial Networks."

        :return:
-            pos_distances: distances of each synthetic data member from its nearest training sample
-            neg_distances: distances of each synthetic data member from its nearest validation sample
+            pos_distances - distances of each synthetic data member from its nearest training sample
+            neg_distances - distances of each synthetic data member from its nearest validation sample
        """
        # nearest neighbor search
        self.attack_strategy_utils.fit(self.knn_learner_members, self.original_data_members)
--- a/tests/test_data_assessment.py
+++ b/tests/test_data_assessment.py
@ -60,8 +60,8 @@ def test_risk_anonymization(name, data, dataset_type, k, mgr):
    original_data_members = ArrayDataset(preprocessed_x_train, y_train)
    original_data_non_members = ArrayDataset(preprocessed_x_test, y_test)

-    score_g, score_h = mgr.assess(original_data_members, original_data_non_members, anonymized_data,
-                                  f'anon_k{k}_{name}')
+    [score_g, score_h] = mgr.assess(original_data_members, original_data_non_members, anonymized_data,
+                                    f'anon_k{k}_{name}')
    assert (score_g.roc_auc_score > 0.5)
    assert (score_g.average_precision_score > 0.5)

@ -96,8 +96,8 @@ def test_risk_kde(name, data, dataset_type, mgr):
    original_data_members = ArrayDataset(encoded, y_train)
    original_data_non_members = ArrayDataset(encoded_test, y_test)

-    score_g, score_h = mgr.assess(original_data_members, original_data_non_members, synth_data,
-                                  'kde' + str(NUM_SYNTH_SAMPLES) + name)
+    [score_g, score_h] = mgr.assess(original_data_members, original_data_non_members, synth_data,
+                                    'kde' + str(NUM_SYNTH_SAMPLES) + name)

    assert (score_g.roc_auc_score > 0.5)
    assert (score_g.average_precision_score > 0.5)