diff --git a/apt/risk/data_assessment/__init__.py b/apt/risk/data_assessment/__init__.py index 330fc94..071a691 100644 --- a/apt/risk/data_assessment/__init__.py +++ b/apt/risk/data_assessment/__init__.py @@ -3,7 +3,9 @@ Module providing privacy risk assessment for synthetic data. The main interface, ``DatasetAttack``, with the assess_privacy() main method assumes the availability of the training data, holdout data and synthetic data at the time of the privacy evaluation. -It is implemented by two types of abstract classes: ``DatasetAttackPerRecord`` and ``DatasetAttackWhole``, to be -implemented by concrete assessment methods. +It is to be implemented by concrete assessment methods, which can run the assessment on a per-record level, +or on the whole dataset. +The abstract class ``DatasetAttackPerRecord`` implements the ``DatasetAttack`` interface, but adds the result +of the attack, so that the final score contains both the result for further analysis and the calculated score. """ from apt.risk.data_assessment import dataset_attack diff --git a/apt/risk/data_assessment/dataset_assessment_manager.py b/apt/risk/data_assessment/dataset_assessment_manager.py index d33b562..a453134 100644 --- a/apt/risk/data_assessment/dataset_assessment_manager.py +++ b/apt/risk/data_assessment/dataset_assessment_manager.py @@ -34,15 +34,15 @@ class DatasetAssessmentManager: def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, synthetic_data: ArrayDataset, dataset_name: str = "dataset") -> ( DatasetAttackScorePerRecordKnnProbabilities, DatasetAttackScoreWholeDatasetKnnDistance): - config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False) + config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False, + generate_plot=self.config.generate_plots) mgr = DatasetAttackPerRecordKnnProbabilities(original_data_members, original_data_non_members, synthetic_data, dataset_name, config_gl) - result = mgr.assess_privacy() - 
score_g = mgr.calculate_privacy_score(result, generate_plot=self.config.generate_plots) + score_g = mgr.assess_privacy() self.attack_scores_per_record_knn_probabilities.append(score_g) config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False) diff --git a/apt/risk/data_assessment/dataset_attack.py b/apt/risk/data_assessment/dataset_attack.py index 61a0dde..9d2904a 100644 --- a/apt/risk/data_assessment/dataset_attack.py +++ b/apt/risk/data_assessment/dataset_attack.py @@ -2,7 +2,7 @@ This module defines the interface for privacy risk assessment of synthetic datasets. """ import abc -from typing import Optional, Union +from typing import Optional import matplotlib.pyplot as plt import numpy as np @@ -10,8 +10,7 @@ from sklearn import metrics from sklearn.metrics import RocCurveDisplay from apt.risk.data_assessment.attack_strategy_utils import AttackStrategyUtils -from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord, \ - DatasetAttackResult +from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord from apt.utils.datasets import ArrayDataset @@ -52,12 +51,11 @@ class DatasetAttack(abc.ABC): self.config = config @abc.abstractmethod - def assess_privacy(self) -> Union[DatasetAttackScore, DatasetAttackResult]: + def assess_privacy(self) -> DatasetAttackScore: """ Assess the privacy of the dataset :return: - result: Union[DatasetAttackScore, DatasetAttackResult] can be either the final privacy attack score, - or an intermediate attack result, which can be translated into a privacy score if needed + score: DatasetAttackScore the privacy attack score """ pass @@ -67,22 +65,13 @@ class DatasetAttackPerRecord(DatasetAttack): An abstract base class for performing privacy risk assessment for synthetic datasets on a per-record level. 
""" - @abc.abstractmethod - def assess_privacy(self) -> DatasetAttackResultPerRecord: - """ - Assess the privacy of the dataset - :return: - result: DatasetAttackResultPerRecord - """ - pass - @abc.abstractmethod def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultPerRecord, generate_plot=False) -> DatasetAttackScore: """ - Calculate dataset privacy score based on the result of the privacy assessment + Calculate dataset privacy score based on the result of the privacy attack :return: - result: DatasetAttackScore + score: DatasetAttackScore """ pass @@ -120,18 +109,3 @@ class DatasetAttackPerRecord(DatasetAttack): auc = metrics.roc_auc_score(labels, results) ap = metrics.average_precision_score(labels, results) return fpr, tpr, threshold, auc, ap - - -class DatasetAttackWhole(DatasetAttack): - """ - An abstract base class for performing privacy risk assessment for synthetic datasets on a whole-dataset level. - """ - - @abc.abstractmethod - def assess_privacy(self) -> DatasetAttackScore: - """ - Assess the privacy of the dataset - :return: - result: DatasetAttackScore - """ - pass diff --git a/apt/risk/data_assessment/dataset_attack_result.py b/apt/risk/data_assessment/dataset_attack_result.py index d64d040..530709b 100644 --- a/apt/risk/data_assessment/dataset_attack_result.py +++ b/apt/risk/data_assessment/dataset_attack_result.py @@ -1,19 +1,24 @@ -from dataclasses import dataclass +from dataclasses import dataclass, field import numpy as np +@dataclass +class DatasetAttackScore: + dataset_name: str + + @dataclass class DatasetAttackResult: dataset_name: str @dataclass -class DatasetAttackResultPerRecord(DatasetAttackResult): - positive_probabilities: np.ndarray - negative_probabilities: np.ndarray +class DatasetAttackScoreWithResult(DatasetAttackScore): + result: DatasetAttackResult = field(repr=False) @dataclass -class DatasetAttackScore: - dataset_name: str +class DatasetAttackResultPerRecord(DatasetAttackResult): + 
positive_probabilities: np.ndarray + negative_probabilities: np.ndarray diff --git a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py index ddae72a..f0d52fd 100644 --- a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py +++ b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py @@ -12,7 +12,7 @@ import numpy as np from sklearn.neighbors import NearestNeighbors from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils -from apt.risk.data_assessment.dataset_attack import DatasetAttackWhole, Config +from apt.risk.data_assessment.dataset_attack import Config, DatasetAttack from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore from apt.utils.datasets import ArrayDataset @@ -51,7 +51,7 @@ class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore): assessment_type: str = 'WholeDatasetKnnDistance' -class DatasetAttackWholeDatasetKnnDistance(DatasetAttackWhole): +class DatasetAttackWholeDatasetKnnDistance(DatasetAttack): """ Privacy risk assessment for synthetic datasets based on distances of synthetic data records from members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic @@ -89,7 +89,7 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttackWhole): Calculate the share of synthetic records closer to the training than the holdout dataset, based on the DCR computed by 'calculate_distances()'. :return: - :result of the attack, based on the NN distances from the query samples to the synthetic data samples + :score of the attack, based on the NN distances from the query samples to the synthetic data samples """ member_distances, non_member_distances = self.calculate_distances() # distance of the synth. 
records to members and to non-members diff --git a/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py b/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py index 057961d..c7aad2f 100644 --- a/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py +++ b/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py @@ -12,7 +12,8 @@ from sklearn.neighbors import NearestNeighbors from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils from apt.risk.data_assessment.dataset_attack import DatasetAttackPerRecord, Config -from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord +from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord, \ + DatasetAttackScoreWithResult from apt.utils.datasets import ArrayDataset @@ -29,16 +30,18 @@ class DatasetAttackConfigPerRecordKnnProbabilities(Config): See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation. distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in sklearn.neighbors.NearestNeighbors documentation. + generate_plot: Whether to generate an AUC ROC curve and persist it in a file """ k: int = 5 use_batches: bool = False batch_size: int = 10 compute_distance: Callable = None distance_params: dict = None + generate_plot: bool = False @dataclass -class DatasetAttackScorePerRecordKnnProbabilities(DatasetAttackScore): +class DatasetAttackScorePerRecordKnnProbabilities(DatasetAttackScoreWithResult): """DatasetAttackPerRecordKnnProbabilities privacy score. 
Attributes ---------- @@ -81,7 +84,7 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord): else: self.knn_learner = NearestNeighbors(n_neighbors=config.k, algorithm='auto') - def assess_privacy(self) -> DatasetAttackResultPerRecord: + def assess_privacy(self) -> DatasetAttackScorePerRecordKnnProbabilities: """ Membership Inference Attack which calculates probabilities of positive and negative samples to be generated by the synthetic data generator. @@ -94,8 +97,9 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord): query samples to the synthetic data samples. :return - :result of the attack with the probabilities of positive and negative samples to be generated by the - synthetic data generator based on the NN distances from the query samples to the synthetic data samples + :score Privacy score of the attack together with the attack result with the probabilities of positive and + negative samples to be generated by the synthetic data generator based on the NN distances from the + query samples to the synthetic data samples """ # nearest neighbor search self.attack_strategy_utils.fit(self.knn_learner, self.synthetic_data) @@ -110,7 +114,9 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord): result = DatasetAttackResultPerRecord(self.dataset_name, positive_probabilities=pos_proba, negative_probabilities=neg_proba) - return result + + score = self.calculate_privacy_score(result, self.config.generate_plot) + return score def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultPerRecord, generate_plot=False) -> DatasetAttackScore: @@ -126,8 +132,9 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord): pos_proba, neg_proba = \ dataset_attack_result.positive_probabilities, dataset_attack_result.negative_probabilities fpr, tpr, threshold, auc, ap = self.calculate_metrics(pos_proba, neg_proba) - score = DatasetAttackScorePerRecordKnnProbabilities(self.dataset_name, 
roc_auc_score=auc, - average_precision_score=ap) + score = DatasetAttackScorePerRecordKnnProbabilities(self.dataset_name, + result=dataset_attack_result, + roc_auc_score=auc, average_precision_score=ap) if generate_plot: self.plot_roc_curve(pos_proba, neg_proba) return score