From 185d9b9664dd6cb9d7b1ba1ef178e1612cc271b0 Mon Sep 17 00:00:00 2001 From: Maya Anderson Date: Tue, 7 Mar 2023 23:03:41 +0200 Subject: [PATCH] Fix share calculation, and find only 1 KNN per sample for it Signed-off-by: Maya Anderson --- .../data_assessment/attack_strategy_utils.py | 8 ++- .../dataset_assessment_manager.py | 14 ++--- ...taset_attack_whole_dataset_knn_distance.py | 53 +++++++++++-------- ...ecord_knn_probabilities_dataset_attack_.py | 10 ++-- 4 files changed, 48 insertions(+), 37 deletions(-) diff --git a/apt/risk/data_assessment/attack_strategy_utils.py b/apt/risk/data_assessment/attack_strategy_utils.py index 8871ecf..415dae5 100644 --- a/apt/risk/data_assessment/attack_strategy_utils.py +++ b/apt/risk/data_assessment/attack_strategy_utils.py @@ -19,13 +19,11 @@ class KNNAttackStrategyUtils(AttackStrategyUtils): Common utilities for attack strategy based on KNN distances. """ - def __init__(self, k: int, use_batches: bool = False, batch_size: int = 10) -> None: + def __init__(self, use_batches: bool = False, batch_size: int = 10) -> None: """ - :param k: How many nearest neighbors to search :param use_batches: Use batches with a progress meter or not when finding KNNs for query set :param batch_size: if use_batches=True, the size of batch_size should be > 0 """ - self.k = k self.use_batches = use_batches self.batch_size = batch_size if use_batches: @@ -49,7 +47,7 @@ class KNNAttackStrategyUtils(AttackStrategyUtils): """ samples = query_samples.get_samples() if not self.use_batches: - distances, _ = knn_learner.kneighbors(samples, self.k, return_distance=True) + distances, _ = knn_learner.kneighbors(samples, return_distance=True) if distance_processor: return distance_processor(distances) else: @@ -61,7 +59,7 @@ class KNNAttackStrategyUtils(AttackStrategyUtils): x_batch = np.reshape(x_batch, [self.batch_size, -1]) # dist_batch: distance between every query sample in batch to its KNNs among training samples - dist_batch, _ = knn_learner.kneighbors(x_batch, self.k, return_distance=True) + dist_batch, _ = knn_learner.kneighbors(x_batch, return_distance=True) # The probability of each sample to be generated if distance_processor: diff --git a/apt/risk/data_assessment/dataset_assessment_manager.py b/apt/risk/data_assessment/dataset_assessment_manager.py index d3bc8de..d33b562 100644 --- a/apt/risk/data_assessment/dataset_assessment_manager.py +++ b/apt/risk/data_assessment/dataset_assessment_manager.py @@ -3,10 +3,12 @@ from typing import Optional import pandas as pd -from apt.risk.data_assessment.per_record_knn_probabilities_dataset_attack_ import DatasetAttackConfigPerRecordKnnProbabilities, \ - DatasetAttackPerRecordKnnProbabilities, DatasetAttackScorePerRecordKnnProbabilities -from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import DatasetAttackConfigWholeDatasetKnnDistance, \ - DatasetAttackWholeDatasetKnnDistance, DatasetAttackScoreWholeDatasetKnnDistance +from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import \ + DatasetAttackConfigWholeDatasetKnnDistance, DatasetAttackWholeDatasetKnnDistance, \ + DatasetAttackScoreWholeDatasetKnnDistance +from apt.risk.data_assessment.per_record_knn_probabilities_dataset_attack_ import \ + DatasetAttackConfigPerRecordKnnProbabilities, DatasetAttackPerRecordKnnProbabilities, \ + DatasetAttackScorePerRecordKnnProbabilities from apt.utils.datasets import ArrayDataset @@ -32,7 +34,7 @@ class DatasetAssessmentManager: def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, synthetic_data: ArrayDataset, dataset_name: str = "dataset") -> ( DatasetAttackScorePerRecordKnnProbabilities, DatasetAttackScoreWholeDatasetKnnDistance): - config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False, k=5) + config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False) mgr = DatasetAttackPerRecordKnnProbabilities(original_data_members, original_data_non_members, synthetic_data, @@ -43,7 +45,7 @@ class DatasetAssessmentManager: score_g = mgr.calculate_privacy_score(result, generate_plot=self.config.generate_plots) self.attack_scores_per_record_knn_probabilities.append(score_g) - config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False, k=5) + config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False) mgr_h = DatasetAttackWholeDatasetKnnDistance(original_data_members, original_data_non_members, synthetic_data, dataset_name, config_h) diff --git a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py index c45ed66..94a95cb 100644 --- a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py +++ b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py @@ -1,6 +1,8 @@ """ -This module implements privacy risk assessment of synthetic datasets based on the paper -"Holdout-Based Fidelity and Privacy Assessment of Mixed-Type Synthetic Data" by M. Platzer and T. Reutterer. +This module implements privacy risk assessment of synthetic datasets based on the papers +"Data Synthesis based on Generative Adversarial Networks." by N. Park, M. Mohammadi, K. Gorde, S. Jajodia, H. Park, +and Y. Kim in International Conference on Very Large Data Bases (VLDB), 2018. +and "Holdout-Based Fidelity and Privacy Assessment of Mixed-Type Synthetic Data" by M. Platzer and T. Reutterer. and on a variation of its reference implementation in https://github.com/mostly-ai/paper-fidelity-accuracy. """ from dataclasses import dataclass @@ -14,13 +16,14 @@ from apt.risk.data_assessment.dataset_attack import DatasetAttackWhole, Config from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore from apt.utils.datasets import ArrayDataset +K = 1 # Number of nearest neighbors to search. For DCR we need only the nearest neighbor. + @dataclass class DatasetAttackConfigWholeDatasetKnnDistance(Config): """Configuration for DatasetAttackWholeDatasetKnnDistance. Attributes: - k: Number of nearest neighbors to search use_batches: Divide query samples into batches or not. batch_size: Query sample batch size. compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must return @@ -29,7 +32,6 @@ class DatasetAttackConfigWholeDatasetKnnDistance(Config): distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in sklearn.neighbors.NearestNeighbors documentation. """ - k: int = 1 use_batches: bool = False batch_size: int = 10 compute_distance: callable = None @@ -41,7 +43,8 @@ class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore): """Configuration for DatasetAttackWholeDatasetKnnDistance. Attributes ---------- - share : the share of synthetic records closer to the training than the holdout dataset + share : the share of synthetic records closer to the training than the holdout dataset. + A value of 0.5 or close to it means good privacy. assessment_type : assessment type is 'WholeDatasetKnnDistance', to be used in reports """ share: float @@ -53,11 +56,14 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttackWhole): Privacy risk assessment for synthetic datasets based on distances of synthetic data records from members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic records closer to the training than the holdout dataset. + By default, the Euclidean distance is used (L2 norm), but another compute_distance() method can be provided in + configuration instead. """ def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, synthetic_data: ArrayDataset, dataset_name: str, - config: Optional[DatasetAttackConfigWholeDatasetKnnDistance] = DatasetAttackConfigWholeDatasetKnnDistance()): + config: Optional[ + DatasetAttackConfigWholeDatasetKnnDistance] = DatasetAttackConfigWholeDatasetKnnDistance()): """ :param original_data_members: A container for the training original samples and labels :param original_data_non_members: A container for the holdout original samples and labels @@ -66,44 +72,47 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttackWhole): :param config: Configuration parameters to guide the assessment process such as which attack frameworks to use, optional """ - attack_strategy_utils = KNNAttackStrategyUtils(config.k, config.use_batches, config.batch_size) + attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size) super().__init__(original_data_members, original_data_non_members, synthetic_data, dataset_name, attack_strategy_utils, config) if config.compute_distance: - self.nn_obj_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto', - metric=config.compute_distance, + self.nn_obj_members = NearestNeighbors(n_neighbors=K, metric=config.compute_distance, metric_params=config.distance_params) - self.nn_obj_non_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto', - metric=config.compute_distance, + self.nn_obj_non_members = NearestNeighbors(n_neighbors=K, metric=config.compute_distance, metric_params=config.distance_params) else: - self.nn_obj_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto') - self.nn_obj_non_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto') + self.nn_obj_members = NearestNeighbors(n_neighbors=K) + self.nn_obj_non_members = NearestNeighbors(n_neighbors=K) def assess_privacy(self) -> DatasetAttackScoreWholeDatasetKnnDistance: """ - Calculate the share of synthetic records closer to the training than the holdout dataset + Calculate the share of synthetic records closer to the training than the holdout dataset, based on the + DCR computed by 'calculate_distances()'. :return: :result of the attack, based on the NN distances from the query samples to the synthetic data samples """ member_distances, non_member_distances = self.calculate_distances() - n_members = len(member_distances) - n_non_members = len(non_member_distances) - assert (n_members == n_non_members) # distance of the synth. records to members and to non-members + # distance of the synth. records to members and to non-members + assert (len(member_distances) == len(non_member_distances)) + n_members = len(self.original_data_members.get_samples()) + n_non_members = len(self.original_data_non_members.get_samples()) + # percent of synth. records closer to members, # and half those whose distance is similar to members and non-members - share = np.mean(member_distances < non_member_distances) + 0.5 * np.mean( + share = np.mean(member_distances < non_member_distances) + (n_members / (n_members + n_non_members)) * np.mean( member_distances == non_member_distances) score = DatasetAttackScoreWholeDatasetKnnDistance(self.dataset_name, share=share) return score def calculate_distances(self): """ - Calculate positive and negative query probabilities, based on their distance to their KNNs among - synthetic samples. + Calculate positive and negative query probabilities, based on their distance to their KNN among + synthetic samples. This distance is called distance to the closest record (DCR), as defined by + N. Park et. al. in "Data Synthesis based on Generative Adversarial Networks." + :return: - pos_distances: distances of each synthetic data member from its nearest training samples - neg_distances: distances of each synthetic data member from its nearest validation samples + pos_distances: distances of each synthetic data member from its nearest training sample + neg_distances: distances of each synthetic data member from its nearest validation sample """ # nearest neighbor search self.attack_strategy_utils.fit(self.original_data_members, self.nn_obj_members) diff --git a/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py b/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py index 0c7e53d..10929c7 100644 --- a/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py +++ b/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py @@ -30,7 +30,7 @@ class DatasetAttackConfigPerRecordKnnProbabilities(Config): distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in sklearn.neighbors.NearestNeighbors documentation. """ - k: int = 1 + k: int = 5 use_batches: bool = False batch_size: int = 10 compute_distance: Callable = None @@ -42,8 +42,8 @@ class DatasetAttackScorePerRecordKnnProbabilities(DatasetAttackScore): """DatasetAttackPerRecordKnnProbabilities privacy score. Attributes ---------- - roc_auc_score : the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack - performance. + roc_auc_score : the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack + performance. average_precision_score: the proportion of Predicted Positive cases that are correctly Real Positives (members) assessment_type : assessment type is 'PerRecordKnnProbabilities', to be used in reports """ @@ -56,6 +56,8 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord): """ Privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset. + By default, the Euclidean distance is used (L2 norm), but another compute_distance() method can be provided in + configuration instead. The area under the receiver operating characteristic curve (AUC ROC) gives the privacy risk measure. """ @@ -70,7 +72,7 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord): :param dataset_name: A name to identify this dataset :param config: Configuration parameters to guide the attack, optional """ - attack_strategy_utils = KNNAttackStrategyUtils(config.k, config.use_batches, config.batch_size) + attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size) super().__init__(original_data_members, original_data_non_members, synthetic_data, dataset_name, attack_strategy_utils, config) if config.compute_distance: