From 4988fea08c6a4c916e61bb4f5483c79b829b3db9 Mon Sep 17 00:00:00 2001 From: Maya Anderson Date: Thu, 9 Mar 2023 23:17:37 +0200 Subject: [PATCH] Rename DatasetAttackPerRecordKnnProbabilities => DatasetAttackMembershipKnnProbabilities Signed-off-by: Maya Anderson --- apt/risk/data_assessment/__init__.py | 5 +- .../dataset_assessment_manager.py | 22 +++---- apt/risk/data_assessment/dataset_attack.py | 26 ++++---- ...et_attack_membership_knn_probabilities.py} | 66 +++++++++---------- .../data_assessment/dataset_attack_result.py | 8 +-- ...taset_attack_whole_dataset_knn_distance.py | 14 ++-- 6 files changed, 70 insertions(+), 71 deletions(-) rename apt/risk/data_assessment/{dataset_attack_per_record_knn_probabilities.py => dataset_attack_membership_knn_probabilities.py} (71%) diff --git a/apt/risk/data_assessment/__init__.py b/apt/risk/data_assessment/__init__.py index 071a691..c3cdf9a 100644 --- a/apt/risk/data_assessment/__init__.py +++ b/apt/risk/data_assessment/__init__.py @@ -5,7 +5,8 @@ The main interface, ``DatasetAttack``, with the assess_privacy() main method ass training data, holdout data and synthetic data at the time of the privacy evaluation. It is to be implemented by concrete assessment methods, which can run the assessment on a per-record level, or on the whole dataset. -The abstract class ``DatasetAttackPerRecord`` implements the ``DatasetAttack`` interface, but adds the result -of the attack, so that the final score contains both the result for further analysis and the calculated score. +The abstract class ``DatasetAttackMembership`` implements the ``DatasetAttack`` interface, but adds the result +of the membership inference attack, so that the final score contains both the membership inference attack result +for further analysis and the calculated score. """ from apt.risk.data_assessment import dataset_attack diff --git a/apt/risk/data_assessment/dataset_assessment_manager.py b/apt/risk/data_assessment/dataset_assessment_manager.py index 310b44e..b3c2b3e 100644 --- a/apt/risk/data_assessment/dataset_assessment_manager.py +++ b/apt/risk/data_assessment/dataset_assessment_manager.py @@ -7,11 +7,9 @@ import pandas as pd from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DEFAULT_DATASET_NAME from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import \ - DatasetAttackConfigWholeDatasetKnnDistance, DatasetAttackWholeDatasetKnnDistance, \ - DatasetAttackScoreWholeDatasetKnnDistance -from apt.risk.data_assessment.dataset_attack_per_record_knn_probabilities import \ - DatasetAttackConfigPerRecordKnnProbabilities, DatasetAttackPerRecordKnnProbabilities, \ - DatasetAttackScorePerRecordKnnProbabilities + DatasetAttackConfigWholeDatasetKnnDistance, DatasetAttackWholeDatasetKnnDistance +from apt.risk.data_assessment.dataset_attack_membership_knn_probabilities import \ + DatasetAttackConfigMembershipKnnProbabilities, DatasetAttackMembershipKnnProbabilities from apt.utils.datasets import ArrayDataset @@ -49,13 +47,13 @@ class DatasetAssessmentManager: :return: a list of dataset attack scores """ - config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False, - generate_plot=self.config.generate_plots) - mgr = DatasetAttackPerRecordKnnProbabilities(original_data_members, - original_data_non_members, - synthetic_data, - config_gl, - dataset_name) + config_gl = DatasetAttackConfigMembershipKnnProbabilities(use_batches=False, + generate_plot=self.config.generate_plots) + mgr = DatasetAttackMembershipKnnProbabilities(original_data_members, + original_data_non_members, + synthetic_data, + config_gl, + dataset_name) score_g = mgr.assess_privacy() self.attack_scores_per_record_knn_probabilities.append(score_g) diff --git a/apt/risk/data_assessment/dataset_attack.py b/apt/risk/data_assessment/dataset_attack.py index f92f5ce..794e38b 100644 --- a/apt/risk/data_assessment/dataset_attack.py +++ b/apt/risk/data_assessment/dataset_attack.py @@ -10,7 +10,7 @@ from sklearn import metrics from sklearn.metrics import RocCurveDisplay from apt.risk.data_assessment.attack_strategy_utils import AttackStrategyUtils -from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord +from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultMembership from apt.utils.datasets import ArrayDataset @@ -60,13 +60,13 @@ class DatasetAttack(abc.ABC): pass -class DatasetAttackPerRecord(DatasetAttack): +class DatasetAttackMembership(DatasetAttack): """ An abstract base class for performing privacy risk assessment for synthetic datasets on a per-record level. """ @abc.abstractmethod - def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultPerRecord, + def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultMembership, generate_plot=False) -> DatasetAttackScore: """ Calculate dataset privacy score based on the result of the privacy attack @@ -75,15 +75,15 @@ class DatasetAttackPerRecord(DatasetAttack): """ pass - def plot_roc_curve(self, pos_probabilities, neg_probabilities, name_prefix=""): + def plot_roc_curve(self, member_probabilities, non_member_probabilities, name_prefix=""): """ Plot ROC curve - :param pos_probabilities: probability estimates of the positive samples, the training data - :param neg_probabilities: probability estimates of the negative samples, the hold-out data + :param member_probabilities: probability estimates of the member samples, the training data + :param non_member_probabilities: probability estimates of the non-member samples, the hold-out data :param name_prefix: name prefix for the ROC curve plot """ - labels = np.concatenate((np.zeros((len(neg_probabilities),)), np.ones((len(pos_probabilities),)))) - results = np.concatenate((neg_probabilities, pos_probabilities)) + labels = np.concatenate((np.zeros((len(non_member_probabilities),)), np.ones((len(member_probabilities),)))) + results = np.concatenate((non_member_probabilities, member_probabilities)) svc_disp = RocCurveDisplay.from_predictions(labels, results) svc_disp.plot() plt.plot([0, 1], [0, 1], color="navy", linewidth=2, linestyle="--", label='No skills') @@ -91,11 +91,11 @@ class DatasetAttackPerRecord(DatasetAttack): plt.savefig(f'{name_prefix}{self.dataset_name}_roc_curve.png') @staticmethod - def calculate_metrics(pos_probabilities, neg_probabilities): + def calculate_metrics(member_probabilities, non_member_probabilities): """ Calculate attack performance metrics - :param pos_probabilities: probability estimates of the positive samples, the training data - :param neg_probabilities: probability estimates of the negative samples, the hold-out data + :param member_probabilities: probability estimates of the member samples, the training data + :param non_member_probabilities: probability estimates of the non-member samples, the hold-out data :return: fpr: False Positive rate tpr: True Positive rate @@ -103,8 +103,8 @@ class DatasetAttackPerRecord(DatasetAttack): auc: area under the Receiver Operating Characteristic Curve ap: average precision score """ - labels = np.concatenate((np.zeros((len(neg_probabilities),)), np.ones((len(pos_probabilities))))) - results = np.concatenate((neg_probabilities, pos_probabilities)) + labels = np.concatenate((np.zeros((len(non_member_probabilities),)), np.ones((len(member_probabilities))))) + results = np.concatenate((non_member_probabilities, member_probabilities)) fpr, tpr, threshold = metrics.roc_curve(labels, results, pos_label=1) auc = metrics.roc_auc_score(labels, results) ap = metrics.average_precision_score(labels, results) diff --git a/apt/risk/data_assessment/dataset_attack_per_record_knn_probabilities.py b/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py similarity index 71% rename from apt/risk/data_assessment/dataset_attack_per_record_knn_probabilities.py rename to apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py index 5fddb81..3891ad1 100644 --- a/apt/risk/data_assessment/dataset_attack_per_record_knn_probabilities.py +++ b/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py @@ -11,15 +11,15 @@ import numpy as np from sklearn.neighbors import NearestNeighbors from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils -from apt.risk.data_assessment.dataset_attack import DatasetAttackPerRecord, Config -from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord, \ +from apt.risk.data_assessment.dataset_attack import DatasetAttackMembership, Config +from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultMembership, \ DatasetAttackScoreWithResult, DEFAULT_DATASET_NAME from apt.utils.datasets import ArrayDataset @dataclass -class DatasetAttackConfigPerRecordKnnProbabilities(Config): - """Configuration for DatasetAttackPerRecordKnnProbabilities. +class DatasetAttackConfigMembershipKnnProbabilities(Config): + """Configuration for DatasetAttackMembershipKnnProbabilities. Attributes: k: Number of nearest neighbors to search @@ -41,21 +41,21 @@ class DatasetAttackConfigPerRecordKnnProbabilities(Config): @dataclass -class DatasetAttackScorePerRecordKnnProbabilities(DatasetAttackScoreWithResult): - """DatasetAttackPerRecordKnnProbabilities privacy score. +class DatasetAttackScoreMembershipKnnProbabilities(DatasetAttackScoreWithResult): + """DatasetAttackMembershipKnnProbabilities privacy score. Attributes ---------- roc_auc_score : the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack performance. - average_precision_score: the proportion of Predicted Positive cases that are correctly Real Positives (members) - assessment_type : assessment type is 'PerRecordKnnProbabilities', to be used in reports + average_precision_score: the proportion of predicted members that are correctly members + assessment_type : assessment type is 'MembershipKnnProbabilities', to be used in reports """ roc_auc_score: float average_precision_score: float - assessment_type: str = 'PerRecordKnnProbabilities' + assessment_type: str = 'MembershipKnnProbabilities' -class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord): +class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership): """ Privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset. @@ -66,7 +66,7 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord): def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, synthetic_data: ArrayDataset, - config: DatasetAttackConfigPerRecordKnnProbabilities = DatasetAttackConfigPerRecordKnnProbabilities(), + config: DatasetAttackConfigMembershipKnnProbabilities = DatasetAttackConfigMembershipKnnProbabilities(), dataset_name: str = DEFAULT_DATASET_NAME): """ :param original_data_members: A container for the training original samples and labels @@ -84,9 +84,9 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord): else: self.knn_learner = NearestNeighbors(n_neighbors=config.k, algorithm='auto') - def assess_privacy(self) -> DatasetAttackScorePerRecordKnnProbabilities: + def assess_privacy(self) -> DatasetAttackScoreMembershipKnnProbabilities: """ - Membership Inference Attack which calculates probabilities of positive and negative samples to be generated by + Membership Inference Attack which calculates probabilities of member and non-member samples to be generated by the synthetic data generator. The assumption is that since the generative model is trained to approximate the training data distribution then the probability of a sample to be a member of the training data should be proportional to the probability @@ -97,46 +97,46 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord): query samples to the synthetic data samples. :return: - Privacy score of the attack together with the attack result with the probabilities of positive and - negative samples to be generated by the synthetic data generator based on the NN distances from the + Privacy score of the attack together with the attack result with the probabilities of member and + non-member samples to be generated by the synthetic data generator based on the NN distances from the query samples to the synthetic data samples """ # nearest neighbor search self.attack_strategy_utils.fit(self.knn_learner, self.synthetic_data) - # positive query - pos_proba = self.attack_strategy_utils.find_knn(self.knn_learner, self.original_data_members, - self.probability_per_sample) + # members query + member_proba = self.attack_strategy_utils.find_knn(self.knn_learner, self.original_data_members, + self.probability_per_sample) - # negative query - neg_proba = self.attack_strategy_utils.find_knn(self.knn_learner, self.original_data_non_members, - self.probability_per_sample) + # non-members query + non_member_proba = self.attack_strategy_utils.find_knn(self.knn_learner, self.original_data_non_members, + self.probability_per_sample) - result = DatasetAttackResultPerRecord(positive_probabilities=pos_proba, - negative_probabilities=neg_proba) + result = DatasetAttackResultMembership(member_probabilities=member_proba, + non_member_probabilities=non_member_proba) score = self.calculate_privacy_score(result, self.config.generate_plot) return score - def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultPerRecord, + def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultMembership, generate_plot=False) -> DatasetAttackScore: """ - Evaluate privacy score from the probabilities of positive and negative samples to be generated by the synthetic + Evaluate privacy score from the probabilities of member and non-member samples to be generated by the synthetic data generator. The probabilities are computed by the 'assess_privacy()' method. - :param dataset_attack_result attack result containing probabilities of positive and negative samples to be + :param dataset_attack_result attack result containing probabilities of member and non-member samples to be generated by the synthetic data generator :param generate_plot generate AUC ROC curve plot and persist it :return: score of the attack, based on distance-based probabilities - mainly the ROC AUC score """ - pos_proba, neg_proba = \ - dataset_attack_result.positive_probabilities, dataset_attack_result.negative_probabilities - fpr, tpr, threshold, auc, ap = self.calculate_metrics(pos_proba, neg_proba) - score = DatasetAttackScorePerRecordKnnProbabilities(self.dataset_name, - result=dataset_attack_result, - roc_auc_score=auc, average_precision_score=ap) + member_proba, non_member_proba = \ + dataset_attack_result.member_probabilities, dataset_attack_result.non_member_probabilities + fpr, tpr, threshold, auc, ap = self.calculate_metrics(member_proba, non_member_proba) + score = DatasetAttackScoreMembershipKnnProbabilities(self.dataset_name, + result=dataset_attack_result, + roc_auc_score=auc, average_precision_score=ap) if generate_plot: - self.plot_roc_curve(pos_proba, neg_proba) + self.plot_roc_curve(member_proba, non_member_proba) return score @staticmethod diff --git a/apt/risk/data_assessment/dataset_attack_result.py b/apt/risk/data_assessment/dataset_attack_result.py index 98b73d3..425c844 100644 --- a/apt/risk/data_assessment/dataset_attack_result.py +++ b/apt/risk/data_assessment/dataset_attack_result.py @@ -2,9 +2,9 @@ from dataclasses import dataclass, field import numpy as np - DEFAULT_DATASET_NAME = "dataset" + @dataclass class DatasetAttackScore: dataset_name: str @@ -21,6 +21,6 @@ class DatasetAttackScoreWithResult(DatasetAttackScore): @dataclass -class DatasetAttackResultPerRecord(DatasetAttackResult): - positive_probabilities: np.ndarray - negative_probabilities: np.ndarray +class DatasetAttackResultMembership(DatasetAttackResult): + member_probabilities: np.ndarray + non_member_probabilities: np.ndarray diff --git a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py index e6ec16f..d85f725 100644 --- a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py +++ b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py @@ -104,20 +104,20 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack): def calculate_distances(self): """ - Calculate positive and negative query probabilities, based on their distance to their KNN among + Calculate member and non-member query probabilities, based on their distance to their KNN among synthetic samples. This distance is called distance to the closest record (DCR), as defined by N. Park et. al. in "Data Synthesis based on Generative Adversarial Networks." :return: - pos_distances - distances of each synthetic data member from its nearest training sample - neg_distances - distances of each synthetic data member from its nearest validation sample + member_distances - distances of each synthetic data member from its nearest training sample + non_member_distances - distances of each synthetic data member from its nearest validation sample """ # nearest neighbor search self.attack_strategy_utils.fit(self.knn_learner_members, self.original_data_members) self.attack_strategy_utils.fit(self.knn_learner_non_members, self.original_data_non_members) - # distances of the synthetic data from the positive and negative samples (members and non-members) - pos_distances = self.attack_strategy_utils.find_knn(self.knn_learner_members, self.synthetic_data) - neg_distances = self.attack_strategy_utils.find_knn(self.knn_learner_non_members, self.synthetic_data) + # distances of the synthetic data from the member and non-member samples + member_distances = self.attack_strategy_utils.find_knn(self.knn_learner_members, self.synthetic_data) + non_member_distances = self.attack_strategy_utils.find_knn(self.knn_learner_non_members, self.synthetic_data) - return pos_distances, neg_distances + return member_distances, non_member_distances