From e5f6089b23f2c9857e43c1d1602f47e3dee5c92d Mon Sep 17 00:00:00 2001 From: Maya Anderson Date: Tue, 7 Mar 2023 22:54:04 +0200 Subject: [PATCH] Rename gan_leaks=>per_record_knn_probabilities and holdout=>whole_dataset_knn_distance Signed-off-by: Maya Anderson --- .../dataset_assessment_manager.py | 44 ++++++++++--------- ...aset_attack_whole_dataset_knn_distance.py} | 20 ++++----- ...cord_knn_probabilities_dataset_attack_.py} | 23 +++++----- 3 files changed, 46 insertions(+), 41 deletions(-) rename apt/risk/data_assessment/{dataset_attack_holdout.py => dataset_attack_whole_dataset_knn_distance.py} (87%) rename apt/risk/data_assessment/{dataset_attack_gan_leaks.py => per_record_knn_probabilities_dataset_attack_.py} (88%) diff --git a/apt/risk/data_assessment/dataset_assessment_manager.py b/apt/risk/data_assessment/dataset_assessment_manager.py index 2b5facc..d3bc8de 100644 --- a/apt/risk/data_assessment/dataset_assessment_manager.py +++ b/apt/risk/data_assessment/dataset_assessment_manager.py @@ -3,10 +3,10 @@ from typing import Optional import pandas as pd -from apt.risk.data_assessment.dataset_attack_gan_leaks import DatasetAttackGanLeaksConfig, DatasetAttackGanLeaks, \ - DatasetAttackScoreGanLeaks -from apt.risk.data_assessment.dataset_attack_holdout import DatasetAttackHoldoutConfig, DatasetAttackHoldout, \ - DatasetAttackScoreHoldout +from apt.risk.data_assessment.per_record_knn_probabilities_dataset_attack_ import DatasetAttackConfigPerRecordKnnProbabilities, \ + DatasetAttackPerRecordKnnProbabilities, DatasetAttackScorePerRecordKnnProbabilities +from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import DatasetAttackConfigWholeDatasetKnnDistance, \ + DatasetAttackWholeDatasetKnnDistance, DatasetAttackScoreWholeDatasetKnnDistance from apt.utils.datasets import ArrayDataset @@ -20,8 +20,8 @@ class DatasetAssessmentManager: """ The main class for running dataset assessment attacks. """ - gan_leaks_attack_scores = [] - holdout_attack_scores = [] + attack_scores_per_record_knn_probabilities = [] + attack_scores_whole_dataset_knn_distance = [] def __init__(self, config: Optional[DatasetAssessmentManagerConfig] = DatasetAssessmentManagerConfig) -> None: """ @@ -31,32 +31,34 @@ class DatasetAssessmentManager: def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, synthetic_data: ArrayDataset, dataset_name: str = "dataset") -> ( - DatasetAttackScoreGanLeaks, DatasetAttackScoreHoldout): - config_gl = DatasetAttackGanLeaksConfig(use_batches=False, k=5) - mgr = DatasetAttackGanLeaks(original_data_members, - original_data_non_members, - synthetic_data, - dataset_name, - config_gl) + DatasetAttackScorePerRecordKnnProbabilities, DatasetAttackScoreWholeDatasetKnnDistance): + config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False, k=5) + mgr = DatasetAttackPerRecordKnnProbabilities(original_data_members, + original_data_non_members, + synthetic_data, + dataset_name, + config_gl) result = mgr.assess_privacy() score_g = mgr.calculate_privacy_score(result, generate_plot=self.config.generate_plots) - self.gan_leaks_attack_scores.append(score_g) + self.attack_scores_per_record_knn_probabilities.append(score_g) - config_h = DatasetAttackHoldoutConfig(use_batches=False, k=5) - mgr_h = DatasetAttackHoldout(original_data_members, original_data_non_members, synthetic_data, - dataset_name, - config_h) + config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False, k=5) + mgr_h = DatasetAttackWholeDatasetKnnDistance(original_data_members, original_data_non_members, synthetic_data, + dataset_name, + config_h) score_h = mgr_h.assess_privacy() - self.holdout_attack_scores.append(score_h) + self.attack_scores_whole_dataset_knn_distance.append(score_h) return score_g, score_h def dump_all_scores_to_files(self): if self.config.persist_reports: results_log_file = "_results.log.csv" - self.dump_scores_to_file(self.gan_leaks_attack_scores, "gan_leaks" + results_log_file, True) - self.dump_scores_to_file(self.holdout_attack_scores, "holdout" + results_log_file, True) + self.dump_scores_to_file(self.attack_scores_per_record_knn_probabilities, + "per_record_knn_probabilities" + results_log_file, True) + self.dump_scores_to_file(self.attack_scores_whole_dataset_knn_distance, + "whole_dataset_knn_distance" + results_log_file, True) @staticmethod def dump_scores_to_file(attack_scores, filename, header: bool): diff --git a/apt/risk/data_assessment/dataset_attack_holdout.py b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py similarity index 87% rename from apt/risk/data_assessment/dataset_attack_holdout.py rename to apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py index 0f7ff65..c45ed66 100644 --- a/apt/risk/data_assessment/dataset_attack_holdout.py +++ b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py @@ -16,8 +16,8 @@ from apt.utils.datasets import ArrayDataset @dataclass -class DatasetAttackHoldoutConfig(Config): - """Configuration for DatasetAttackHoldout. +class DatasetAttackConfigWholeDatasetKnnDistance(Config): + """Configuration for DatasetAttackWholeDatasetKnnDistance. Attributes: k: Number of nearest neighbors to search @@ -37,18 +37,18 @@ class DatasetAttackHoldoutConfig(Config): @dataclass -class DatasetAttackScoreHoldout(DatasetAttackScore): - """Configuration for DatasetAttackHoldout. +class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore): + """Configuration for DatasetAttackWholeDatasetKnnDistance. Attributes ---------- share : the share of synthetic records closer to the training than the holdout dataset - assessment_type : assessment type is 'Holdout', to be used in reports + assessment_type : assessment type is 'WholeDatasetKnnDistance', to be used in reports """ share: float - assessment_type: str = 'Holdout' + assessment_type: str = 'WholeDatasetKnnDistance' -class DatasetAttackHoldout(DatasetAttackWhole): +class DatasetAttackWholeDatasetKnnDistance(DatasetAttackWhole): """ Privacy risk assessment for synthetic datasets based on distances of synthetic data records from members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic @@ -57,7 +57,7 @@ class DatasetAttackHoldout(DatasetAttackWhole): def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, synthetic_data: ArrayDataset, dataset_name: str, - config: Optional[DatasetAttackHoldoutConfig] = DatasetAttackHoldoutConfig()): + config: Optional[DatasetAttackConfigWholeDatasetKnnDistance] = DatasetAttackConfigWholeDatasetKnnDistance()): """ :param original_data_members: A container for the training original samples and labels :param original_data_non_members: A container for the holdout original samples and labels @@ -80,7 +80,7 @@ class DatasetAttackHoldout(DatasetAttackWhole): self.nn_obj_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto') self.nn_obj_non_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto') - def assess_privacy(self) -> DatasetAttackScoreHoldout: + def assess_privacy(self) -> DatasetAttackScoreWholeDatasetKnnDistance: """ Calculate the share of synthetic records closer to the training than the holdout dataset :return: @@ -94,7 +94,7 @@ class DatasetAttackHoldout(DatasetAttackWhole): # and half those whose distance is similar to members and non-members share = np.mean(member_distances < non_member_distances) + 0.5 * np.mean( member_distances == non_member_distances) - score = DatasetAttackScoreHoldout(self.dataset_name, share=share) + score = DatasetAttackScoreWholeDatasetKnnDistance(self.dataset_name, share=share) return score def calculate_distances(self): diff --git a/apt/risk/data_assessment/dataset_attack_gan_leaks.py b/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py similarity index 88% rename from apt/risk/data_assessment/dataset_attack_gan_leaks.py rename to apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py index d4c6321..0c7e53d 100644 --- a/apt/risk/data_assessment/dataset_attack_gan_leaks.py +++ b/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py @@ -17,8 +17,8 @@ from apt.utils.datasets import ArrayDataset @dataclass -class DatasetAttackGanLeaksConfig(Config): - """Configuration for DatasetAttackGanLeaks. +class DatasetAttackConfigPerRecordKnnProbabilities(Config): + """Configuration for DatasetAttackPerRecordKnnProbabilities. Attributes: k: Number of nearest neighbors to search @@ -38,20 +38,21 @@ class DatasetAttackGanLeaksConfig(Config): @dataclass -class DatasetAttackScoreGanLeaks(DatasetAttackScore): - """DatasetAttackGanLeaks privacy score. +class DatasetAttackScorePerRecordKnnProbabilities(DatasetAttackScore): + """DatasetAttackPerRecordKnnProbabilities privacy score. Attributes ---------- - roc_auc_score : the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack performance. + roc_auc_score : the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack + performance. average_precision_score: the proportion of Predicted Positive cases that are correctly Real Positives (members) - assessment_type : assessment type is 'GANLeaks', to be used in reports + assessment_type : assessment type is 'PerRecordKnnProbabilities', to be used in reports """ roc_auc_score: float average_precision_score: float - assessment_type: str = 'GANLeaks' + assessment_type: str = 'PerRecordKnnProbabilities' -class DatasetAttackGanLeaks(DatasetAttackPerRecord): +class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord): """ Privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset. @@ -60,7 +61,8 @@ class DatasetAttackGanLeaks(DatasetAttackPerRecord): def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, synthetic_data: ArrayDataset, dataset_name: str, - config: Optional[DatasetAttackGanLeaksConfig] = DatasetAttackGanLeaksConfig()): + config: Optional[ + DatasetAttackConfigPerRecordKnnProbabilities] = DatasetAttackConfigPerRecordKnnProbabilities()): """ :param original_data_members: A container for the training original samples and labels :param original_data_non_members: A container for the holdout original samples and labels @@ -122,7 +124,8 @@ class DatasetAttackGanLeaks(DatasetAttackPerRecord): pos_proba, neg_proba = \ dataset_attack_result.positive_probabilities, dataset_attack_result.negative_probabilities fpr, tpr, threshold, auc, ap = self.calculate_metrics(pos_proba, neg_proba) - score = DatasetAttackScoreGanLeaks(self.dataset_name, roc_auc_score=auc, average_precision_score=ap) + score = DatasetAttackScorePerRecordKnnProbabilities(self.dataset_name, roc_auc_score=auc, + average_precision_score=ap) if generate_plot: self.plot_roc_curve(pos_proba, neg_proba) return score