From 80bec0c45ba269ea03bdf7be931314a26cd5d798 Mon Sep 17 00:00:00 2001 From: Maya Anderson Date: Thu, 16 Mar 2023 17:51:49 +0200 Subject: [PATCH] Add a risk score to the base class DatasetAttackScore, so that every implementation could set it based on its specific values. Signed-off-by: Maya Anderson --- .../data_assessment/attack_strategy_utils.py | 4 +-- .../dataset_assessment_manager.py | 2 +- apt/risk/data_assessment/dataset_attack.py | 2 +- ...set_attack_membership_knn_probabilities.py | 27 ++++++++++------ .../data_assessment/dataset_attack_result.py | 3 +- ...taset_attack_whole_dataset_knn_distance.py | 20 +++++++----- tests/test_data_assessment.py | 31 +++++++++++-------- 7 files changed, 53 insertions(+), 36 deletions(-) diff --git a/apt/risk/data_assessment/attack_strategy_utils.py b/apt/risk/data_assessment/attack_strategy_utils.py index 8babf9a..674feff 100644 --- a/apt/risk/data_assessment/attack_strategy_utils.py +++ b/apt/risk/data_assessment/attack_strategy_utils.py @@ -39,8 +39,8 @@ class KNNAttackStrategyUtils(AttackStrategyUtils): :param query_samples: query samples, to which nearest neighbors are to be found :param knn_learner: unsupervised learner for implementing neighbor searches, after it was fitted :param distance_processor: function for processing the distance into another more relevant metric per sample. 
- Its input is an array representing distances (the distances returned by NearestNeighbors.kneighbors() ), - and the output should be another array with distance-based values that enable to compute the final score + Its input is an array representing distances (the distances returned by NearestNeighbors.kneighbors() ), and + the output should be another array with distance-based values that make it possible to compute the final risk score :return: distances of the query samples to their nearest neighbors, or a metric based on that distance and calculated by the distance_processor function diff --git a/apt/risk/data_assessment/dataset_assessment_manager.py b/apt/risk/data_assessment/dataset_assessment_manager.py index b3c2b3e..faeac69 100644 --- a/apt/risk/data_assessment/dataset_assessment_manager.py +++ b/apt/risk/data_assessment/dataset_assessment_manager.py @@ -45,7 +45,7 @@ class DatasetAssessmentManager: :param dataset_name: A name to identify this dataset, optional :return: - a list of dataset attack scores + a list of dataset attack risk scores """ config_gl = DatasetAttackConfigMembershipKnnProbabilities(use_batches=False, generate_plot=self.config.generate_plots) diff --git a/apt/risk/data_assessment/dataset_attack.py b/apt/risk/data_assessment/dataset_attack.py index 9eb9934..728a5f0 100644 --- a/apt/risk/data_assessment/dataset_attack.py +++ b/apt/risk/data_assessment/dataset_attack.py @@ -54,7 +54,7 @@ class DatasetAttack(abc.ABC): """ Assess the privacy of the dataset :return: - score: DatasetAttackScore the privacy attack score + score: DatasetAttackScore the privacy attack risk score """ pass diff --git a/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py b/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py index 0e7b32b..495d284 100644 --- a/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py +++ b/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py @@ -42,17 +42,24 @@ class 
DatasetAttackConfigMembershipKnnProbabilities(Config): @dataclass class DatasetAttackScoreMembershipKnnProbabilities(DatasetAttackScore): - """DatasetAttackMembershipKnnProbabilities privacy score. - Attributes - ---------- - roc_auc_score : the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack - performance. - average_precision_score: the proportion of predicted members that are correctly members - assessment_type : assessment type is 'MembershipKnnProbabilities', to be used in reports + """DatasetAttackMembershipKnnProbabilities privacy risk score. """ - roc_auc_score: float = -1.0 - average_precision_score: float = -1.0 - assessment_type: str = 'MembershipKnnProbabilities' + roc_auc_score: float + average_precision_score: float + assessment_type: str = 'MembershipKnnProbabilities' # to be used in reports + + def __init__(self, dataset_name: str, roc_auc_score: float, average_precision_score: float, + result: DatasetAttackResultMembership) -> None: + """ + dataset_name: dataset name to be used in reports + roc_auc_score: the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack + performance. 
+ average_precision_score: the proportion of predicted members that are correctly members + result: the result of the membership inference attack + """ + super().__init__(dataset_name=dataset_name, risk_score=roc_auc_score, result=result) + self.roc_auc_score = roc_auc_score + self.average_precision_score = average_precision_score class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership): diff --git a/apt/risk/data_assessment/dataset_attack_result.py b/apt/risk/data_assessment/dataset_attack_result.py index 813e642..55a4e2f 100644 --- a/apt/risk/data_assessment/dataset_attack_result.py +++ b/apt/risk/data_assessment/dataset_attack_result.py @@ -14,7 +14,8 @@ class DatasetAttackResult: @dataclass class DatasetAttackScore: dataset_name: str - result: Optional[DatasetAttackResult] = None + risk_score: float + result: Optional[DatasetAttackResult] @dataclass diff --git a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py index b1ade3b..a57ddf1 100644 --- a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py +++ b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py @@ -39,15 +39,19 @@ class DatasetAttackConfigWholeDatasetKnnDistance(Config): @dataclass class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore): - """DatasetAttackWholeDatasetKnnDistance privacy score. - Attributes - ---------- - share : the share of synthetic records closer to the training than the holdout dataset. - A value of 0.5 or close to it means good privacy. - assessment_type : assessment type is 'WholeDatasetKnnDistance', to be used in reports + """DatasetAttackWholeDatasetKnnDistance privacy risk score. 
""" - share: float = -1.0 - assessment_type: str = 'WholeDatasetKnnDistance' + share: float + assessment_type: str = 'WholeDatasetKnnDistance' # to be used in reports + + def __init__(self, dataset_name, share) -> None: + """ + dataset_name: dataset name to be used in reports + share : the share of synthetic records closer to the training than the holdout dataset. + A value of 0.5 or close to it means good privacy. + """ + super().__init__(dataset_name=dataset_name, risk_score=share, result=None) + self.share = share class DatasetAttackWholeDatasetKnnDistance(DatasetAttack): diff --git a/tests/test_data_assessment.py b/tests/test_data_assessment.py index 5bbe6ba..b83a382 100644 --- a/tests/test_data_assessment.py +++ b/tests/test_data_assessment.py @@ -14,6 +14,10 @@ from apt.utils.dataset_utils import get_iris_dataset_np, get_diabetes_dataset_np get_nursery_dataset_pd from apt.utils.datasets import ArrayDataset +MIN_SHARE = 0.5 +MIN_ROC_AUC = 0.0 +MIN_PRECISION = 0.0 + NUM_SYNTH_SAMPLES = 40000 NUM_SYNTH_COMPONENTS = 4 @@ -60,12 +64,9 @@ def test_risk_anonymization(name, data, dataset_type, k, mgr): original_data_members = ArrayDataset(preprocessed_x_train, y_train) original_data_non_members = ArrayDataset(preprocessed_x_test, y_test) - [score_g, score_h] = mgr.assess(original_data_members, original_data_non_members, anonymized_data, - f'anon_k{k}_{name}') - assert (score_g.roc_auc_score > 0.5) - assert (score_g.average_precision_score > 0.5) - - assert (score_h.share > 0.5) + dataset_name = f'anon_k{k}_{name}' + assess_privacy_and_validate_result(mgr, original_data_members, original_data_non_members, anonymized_data, + dataset_name) testdata = [('iris_np', iris_dataset_np, 'np', mgr), @@ -96,13 +97,8 @@ def test_risk_kde(name, data, dataset_type, mgr): original_data_members = ArrayDataset(encoded, y_train) original_data_non_members = ArrayDataset(encoded_test, y_test) - [score_g, score_h] = mgr.assess(original_data_members, original_data_non_members, synth_data, 
- 'kde' + str(NUM_SYNTH_SAMPLES) + name) - - assert (score_g.roc_auc_score > 0.5) - assert (score_g.average_precision_score > 0.5) - - assert (score_h.share > 0.5) + dataset_name = 'kde' + str(NUM_SYNTH_SAMPLES) + name + assess_privacy_and_validate_result(mgr, original_data_members, original_data_non_members, synth_data, dataset_name) def kde(n_samples, n_components, original_data): @@ -166,3 +162,12 @@ def preprocess_nursery_x_data(x_train, x_test): encoded = preprocessor.fit_transform(x_train) encoded_test = preprocessor.fit_transform(x_test) return encoded, encoded_test + + +def assess_privacy_and_validate_result(dataset_assessment_manager, original_data_members, original_data_non_members, + synth_data, dataset_name): + [score_g, score_h] = dataset_assessment_manager.assess(original_data_members, original_data_non_members, synth_data, + dataset_name) + assert (score_g.roc_auc_score > MIN_ROC_AUC) + assert (score_g.average_precision_score > MIN_PRECISION) + assert (score_h.share > MIN_SHARE)