Fix share calculation, and find only 1 KNN per sample for it

Signed-off-by: Maya Anderson <mayaa@il.ibm.com>
2026-07-23 17:01:03 +02:00 · 2023-03-07 23:03:41 +02:00 · 2023-03-07 23:03:41 +02:00 · 185d9b9664
commit 185d9b9664
parent e5f6089b23
4 changed files with 48 additions and 37 deletions
--- a/apt/risk/data_assessment/attack_strategy_utils.py
+++ b/apt/risk/data_assessment/attack_strategy_utils.py
@ -19,13 +19,11 @@ class KNNAttackStrategyUtils(AttackStrategyUtils):
         Common utilities for attack strategy based on KNN distances.
    """

-    def __init__(self, k: int, use_batches: bool = False, batch_size: int = 10) -> None:
+    def __init__(self, use_batches: bool = False, batch_size: int = 10) -> None:
        """
-        :param k: How many nearest neighbors to search
        :param use_batches: Use batches with a progress meter or not when finding KNNs for query set
        :param batch_size: if use_batches=True, the size of batch_size should be > 0
        """
-        self.k = k
        self.use_batches = use_batches
        self.batch_size = batch_size
        if use_batches:
@ -49,7 +47,7 @@ class KNNAttackStrategyUtils(AttackStrategyUtils):
        """
        samples = query_samples.get_samples()
        if not self.use_batches:
-            distances, _ = knn_learner.kneighbors(samples, self.k, return_distance=True)
+            distances, _ = knn_learner.kneighbors(samples, return_distance=True)
            if distance_processor:
                return distance_processor(distances)
            else:
@ -61,7 +59,7 @@ class KNNAttackStrategyUtils(AttackStrategyUtils):
            x_batch = np.reshape(x_batch, [self.batch_size, -1])

            # dist_batch: distance between every query sample in batch to its KNNs among training samples
-            dist_batch, _ = knn_learner.kneighbors(x_batch, self.k, return_distance=True)
+            dist_batch, _ = knn_learner.kneighbors(x_batch, return_distance=True)

            # The probability of each sample to be generated
            if distance_processor:
--- a/apt/risk/data_assessment/dataset_assessment_manager.py
+++ b/apt/risk/data_assessment/dataset_assessment_manager.py
@ -3,10 +3,12 @@ from typing import Optional

 import pandas as pd

-from apt.risk.data_assessment.per_record_knn_probabilities_dataset_attack_ import DatasetAttackConfigPerRecordKnnProbabilities, \
-    DatasetAttackPerRecordKnnProbabilities, DatasetAttackScorePerRecordKnnProbabilities
-from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import DatasetAttackConfigWholeDatasetKnnDistance, \
-    DatasetAttackWholeDatasetKnnDistance, DatasetAttackScoreWholeDatasetKnnDistance
+from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import \
+    DatasetAttackConfigWholeDatasetKnnDistance, DatasetAttackWholeDatasetKnnDistance, \
+    DatasetAttackScoreWholeDatasetKnnDistance
+from apt.risk.data_assessment.per_record_knn_probabilities_dataset_attack_ import \
+    DatasetAttackConfigPerRecordKnnProbabilities, DatasetAttackPerRecordKnnProbabilities, \
+    DatasetAttackScorePerRecordKnnProbabilities
 from apt.utils.datasets import ArrayDataset


@ -32,7 +34,7 @@ class DatasetAssessmentManager:
    def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
               synthetic_data: ArrayDataset, dataset_name: str = "dataset") -> (
            DatasetAttackScorePerRecordKnnProbabilities, DatasetAttackScoreWholeDatasetKnnDistance):
-        config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False, k=5)
+        config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False)
        mgr = DatasetAttackPerRecordKnnProbabilities(original_data_members,
                                                     original_data_non_members,
                                                     synthetic_data,
@ -43,7 +45,7 @@ class DatasetAssessmentManager:
        score_g = mgr.calculate_privacy_score(result, generate_plot=self.config.generate_plots)
        self.attack_scores_per_record_knn_probabilities.append(score_g)

-        config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False, k=5)
+        config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False)
        mgr_h = DatasetAttackWholeDatasetKnnDistance(original_data_members, original_data_non_members, synthetic_data,
                                                     dataset_name,
                                                     config_h)
--- a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py
+++ b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py
@ -1,6 +1,8 @@
 """
-This module implements privacy risk assessment of synthetic datasets based on the paper
-"Holdout-Based Fidelity and Privacy Assessment of Mixed-Type Synthetic Data" by M. Platzer and T. Reutterer.
+This module implements privacy risk assessment of synthetic datasets based on the papers
+"Data Synthesis based on Generative Adversarial Networks." by N. Park, M. Mohammadi, K. Gorde, S. Jajodia, H. Park,
+and Y. Kim in International Conference on Very Large Data Bases (VLDB), 2018,
+and "Holdout-Based Fidelity and Privacy Assessment of Mixed-Type Synthetic Data" by M. Platzer and T. Reutterer.
 and on a variation of its reference implementation in https://github.com/mostly-ai/paper-fidelity-accuracy.
 """
 from dataclasses import dataclass
@ -14,13 +16,14 @@ from apt.risk.data_assessment.dataset_attack import DatasetAttackWhole, Config
 from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore
 from apt.utils.datasets import ArrayDataset

+K = 1  # Number of nearest neighbors to search. For DCR we need only the nearest neighbor.
+

@dataclass
 class DatasetAttackConfigWholeDatasetKnnDistance(Config):
    """Configuration for DatasetAttackWholeDatasetKnnDistance.

    Attributes:
-        k: Number of nearest neighbors to search
        use_batches:  Divide query samples into batches or not.
        batch_size:   Query sample batch size.
        compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must return
@ -29,7 +32,6 @@ class DatasetAttackConfigWholeDatasetKnnDistance(Config):
        distance_params:  Additional keyword arguments for the distance computation function, see 'metric_params' in
            sklearn.neighbors.NearestNeighbors documentation.
    """
-    k: int = 1
    use_batches: bool = False
    batch_size: int = 10
    compute_distance: callable = None
@ -41,7 +43,8 @@ class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore):
    """Configuration for DatasetAttackWholeDatasetKnnDistance.
    Attributes
    ----------
-    share : the share of synthetic records closer to the training than the holdout dataset
+    share : the share of synthetic records closer to the training than the holdout dataset.
+            A value of 0.5 or close to it means good privacy.
    assessment_type : assessment type is 'WholeDatasetKnnDistance', to be used in reports
    """
    share: float
@ -53,11 +56,14 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttackWhole):
         Privacy risk assessment for synthetic datasets based on distances of synthetic data records from
         members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic
         records closer to the training than the holdout dataset.
+         By default, the Euclidean distance is used (L2 norm), but another compute_distance() method can be provided in
+         configuration instead.
    """

    def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
                 synthetic_data: ArrayDataset, dataset_name: str,
-                 config: Optional[DatasetAttackConfigWholeDatasetKnnDistance] = DatasetAttackConfigWholeDatasetKnnDistance()):
+                 config: Optional[
+                     DatasetAttackConfigWholeDatasetKnnDistance] = DatasetAttackConfigWholeDatasetKnnDistance()):
        """
        :param original_data_members: A container for the training original samples and labels
        :param original_data_non_members: A container for the holdout original samples and labels
@ -66,44 +72,47 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttackWhole):
        :param config: Configuration parameters to guide the assessment process such as which attack
               frameworks to use, optional
        """
-        attack_strategy_utils = KNNAttackStrategyUtils(config.k, config.use_batches, config.batch_size)
+        attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size)
        super().__init__(original_data_members, original_data_non_members, synthetic_data, dataset_name,
                         attack_strategy_utils, config)
        if config.compute_distance:
-            self.nn_obj_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto',
-                                                   metric=config.compute_distance,
+            self.nn_obj_members = NearestNeighbors(n_neighbors=K, metric=config.compute_distance,
                                                   metric_params=config.distance_params)
-            self.nn_obj_non_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto',
-                                                       metric=config.compute_distance,
+            self.nn_obj_non_members = NearestNeighbors(n_neighbors=K, metric=config.compute_distance,
                                                       metric_params=config.distance_params)
        else:
-            self.nn_obj_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto')
-            self.nn_obj_non_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto')
+            self.nn_obj_members = NearestNeighbors(n_neighbors=K)
+            self.nn_obj_non_members = NearestNeighbors(n_neighbors=K)

    def assess_privacy(self) -> DatasetAttackScoreWholeDatasetKnnDistance:
        """
-        Calculate the share of synthetic records closer to the training than the holdout dataset
+        Calculate the share of synthetic records closer to the training than the holdout dataset, based on the
+        DCR computed by 'calculate_distances()'.
        :return:
            :result of the attack, based on the NN distances from the query samples to the synthetic data samples
        """
        member_distances, non_member_distances = self.calculate_distances()
-        n_members = len(member_distances)
-        n_non_members = len(non_member_distances)
-        assert (n_members == n_non_members)  # distance of the synth. records to members and to non-members
+        # distance of the synth. records to members and to non-members
+        assert (len(member_distances) == len(non_member_distances))
+        n_members = len(self.original_data_members.get_samples())
+        n_non_members = len(self.original_data_non_members.get_samples())
+
        # percent of synth. records closer to members,
        # and half those whose distance is similar to members and non-members
-        share = np.mean(member_distances < non_member_distances) + 0.5 * np.mean(
+        share = np.mean(member_distances < non_member_distances) + (n_members / (n_members + n_non_members)) * np.mean(
            member_distances == non_member_distances)
        score = DatasetAttackScoreWholeDatasetKnnDistance(self.dataset_name, share=share)
        return score

    def calculate_distances(self):
        """
-        Calculate positive and negative query probabilities, based on their distance to their KNNs among
-        synthetic samples.
+        Calculate the distance of each synthetic record to its nearest training sample and its nearest holdout sample.
+        This distance is called distance to the closest record (DCR), as defined by
+        N. Park et al. in "Data Synthesis based on Generative Adversarial Networks."
+
        :return:
-            pos_distances: distances of each synthetic data member from its nearest training samples
-            neg_distances: distances of each synthetic data member from its nearest validation samples
+            pos_distances: distances of each synthetic data member from its nearest training sample
+            neg_distances: distances of each synthetic data member from its nearest validation sample
        """
        # nearest neighbor search
        self.attack_strategy_utils.fit(self.original_data_members, self.nn_obj_members)
--- a/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py
+++ b/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py
@ -30,7 +30,7 @@ class DatasetAttackConfigPerRecordKnnProbabilities(Config):
        distance_params:  Additional keyword arguments for the distance computation function, see 'metric_params' in
            sklearn.neighbors.NearestNeighbors documentation.
    """
-    k: int = 1
+    k: int = 5
    use_batches: bool = False
    batch_size: int = 10
    compute_distance: Callable = None
@ -42,8 +42,8 @@ class DatasetAttackScorePerRecordKnnProbabilities(DatasetAttackScore):
    """DatasetAttackPerRecordKnnProbabilities privacy score.
    Attributes
    ----------
-    roc_auc_score :    the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack
-                        performance.
+    roc_auc_score :   the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack
+                      performance.
    average_precision_score: the proportion of Predicted Positive cases that are correctly Real Positives (members)
    assessment_type : assessment type is 'PerRecordKnnProbabilities', to be used in reports
    """
@ -56,6 +56,8 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
    """
         Privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of
         members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset.
+         By default, the Euclidean distance is used (L2 norm), but another compute_distance() method can be provided in
+         configuration instead.
         The area under the receiver operating characteristic curve (AUC ROC) gives the privacy risk measure.
    """

@ -70,7 +72,7 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
        :param dataset_name: A name to identify this dataset
        :param config: Configuration parameters to guide the attack, optional
        """
-        attack_strategy_utils = KNNAttackStrategyUtils(config.k, config.use_batches, config.batch_size)
+        attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size)
        super().__init__(original_data_members, original_data_non_members, synthetic_data, dataset_name,
                         attack_strategy_utils, config)
        if config.compute_distance: