diff --git a/apt/risk/data_assessment/dataset_assessment_manager.py b/apt/risk/data_assessment/dataset_assessment_manager.py index a453134..310b44e 100644 --- a/apt/risk/data_assessment/dataset_assessment_manager.py +++ b/apt/risk/data_assessment/dataset_assessment_manager.py @@ -1,12 +1,15 @@ -from dataclasses import dataclass +from __future__ import annotations from typing import Optional +from dataclasses import dataclass + import pandas as pd +from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DEFAULT_DATASET_NAME from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import \ DatasetAttackConfigWholeDatasetKnnDistance, DatasetAttackWholeDatasetKnnDistance, \ DatasetAttackScoreWholeDatasetKnnDistance -from apt.risk.data_assessment.per_record_knn_probabilities_dataset_attack_ import \ +from apt.risk.data_assessment.dataset_attack_per_record_knn_probabilities import \ DatasetAttackConfigPerRecordKnnProbabilities, DatasetAttackPerRecordKnnProbabilities, \ DatasetAttackScorePerRecordKnnProbabilities from apt.utils.datasets import ArrayDataset @@ -22,8 +25,8 @@ class DatasetAssessmentManager: """ The main class for running dataset assessment attacks. 
""" - attack_scores_per_record_knn_probabilities = [] - attack_scores_whole_dataset_knn_distance = [] + attack_scores_per_record_knn_probabilities: list[DatasetAttackScore] = [] + attack_scores_whole_dataset_knn_distance: list[DatasetAttackScore] = [] def __init__(self, config: Optional[DatasetAssessmentManagerConfig] = DatasetAssessmentManagerConfig) -> None: """ @@ -32,27 +35,39 @@ class DatasetAssessmentManager: self.config = config def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, - synthetic_data: ArrayDataset, dataset_name: str = "dataset") -> ( - DatasetAttackScorePerRecordKnnProbabilities, DatasetAttackScoreWholeDatasetKnnDistance): + synthetic_data: ArrayDataset, dataset_name: str = DEFAULT_DATASET_NAME) -> list[DatasetAttackScore]: + """ + Do dataset assessment by running dataset attacks, and return their scores. + + :param original_data_members: A container for the training original samples and labels, + only samples are used in the assessment + :param original_data_non_members: A container for the holdout original samples and labels, + only samples are used in the assessment + :param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment + :param dataset_name: A name to identify this dataset, optional + + :return: + a list of dataset attack scores + """ config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False, generate_plot=self.config.generate_plots) mgr = DatasetAttackPerRecordKnnProbabilities(original_data_members, original_data_non_members, synthetic_data, - dataset_name, - config_gl) + config_gl, + dataset_name) score_g = mgr.assess_privacy() self.attack_scores_per_record_knn_probabilities.append(score_g) config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False) mgr_h = DatasetAttackWholeDatasetKnnDistance(original_data_members, original_data_non_members, synthetic_data, - dataset_name, - config_h) + config_h, + 
dataset_name) score_h = mgr_h.assess_privacy() self.attack_scores_whole_dataset_knn_distance.append(score_h) - return score_g, score_h + return [score_g, score_h] def dump_all_scores_to_files(self): if self.config.persist_reports: @@ -63,6 +78,6 @@ class DatasetAssessmentManager: "whole_dataset_knn_distance" + results_log_file, True) @staticmethod - def dump_scores_to_file(attack_scores, filename, header: bool): - run_results_df = pd.DataFrame(attack_scores) + def dump_scores_to_file(attack_scores: list[DatasetAttackScore], filename: str, header: bool): + run_results_df = pd.DataFrame(attack_scores).drop('result', axis=1, errors='ignore') # don't serialize result run_results_df.to_csv(filename, header=header, encoding='utf-8', index=False, mode='w') # Overwrite diff --git a/apt/risk/data_assessment/dataset_attack.py b/apt/risk/data_assessment/dataset_attack.py index 9d2904a..f92f5ce 100644 --- a/apt/risk/data_assessment/dataset_attack.py +++ b/apt/risk/data_assessment/dataset_attack.py @@ -29,26 +29,26 @@ class DatasetAttack(abc.ABC): """ def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, - synthetic_data: ArrayDataset, dataset_name: str, attack_strategy_utils: AttackStrategyUtils, - config: Optional[Config] = Config()) -> None: + synthetic_data: ArrayDataset, config: Config, dataset_name: str, + attack_strategy_utils: Optional[AttackStrategyUtils] = None) -> None: """ :param original_data_members: A container for the training original samples and labels, only samples are used in the assessment :param original_data_non_members: A container for the holdout original samples and labels, only samples are used in the assessment :param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment - :param dataset_name: A name to identify the dataset under attack - :param attack_strategy_utils: Utils for use with the attack strategy :param config: Configuration parameters to guide 
the assessment process such as which attack - frameworks to use, optional + frameworks to use + :param dataset_name: A name to identify the dataset under attack + :param attack_strategy_utils: Utils for use with the attack strategy, optional """ self.original_data_members = original_data_members self.original_data_non_members = original_data_non_members self.synthetic_data = synthetic_data - self.dataset_name = dataset_name - self.attack_strategy_utils = attack_strategy_utils self.config = config + self.attack_strategy_utils = attack_strategy_utils + self.dataset_name = dataset_name @abc.abstractmethod def assess_privacy(self) -> DatasetAttackScore: diff --git a/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py b/apt/risk/data_assessment/dataset_attack_per_record_knn_probabilities.py similarity index 85% rename from apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py rename to apt/risk/data_assessment/dataset_attack_per_record_knn_probabilities.py index c7aad2f..5fddb81 100644 --- a/apt/risk/data_assessment/per_record_knn_probabilities_dataset_attack_.py +++ b/apt/risk/data_assessment/dataset_attack_per_record_knn_probabilities.py @@ -5,7 +5,7 @@ published in Proceedings of the 2020 ACM SIGSAC Conference on Computer and Commu https://doi.org/10.1145/3372297.3417238 and its implementation in https://github.com/DingfanChen/GAN-Leaks. 
""" from dataclasses import dataclass -from typing import Optional, Callable +from typing import Callable import numpy as np from sklearn.neighbors import NearestNeighbors @@ -13,7 +13,7 @@ from sklearn.neighbors import NearestNeighbors from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils from apt.risk.data_assessment.dataset_attack import DatasetAttackPerRecord, Config from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord, \ - DatasetAttackScoreWithResult + DatasetAttackScoreWithResult, DEFAULT_DATASET_NAME from apt.utils.datasets import ArrayDataset @@ -65,19 +65,19 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord): """ def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, - synthetic_data: ArrayDataset, dataset_name: str, - config: Optional[ - DatasetAttackConfigPerRecordKnnProbabilities] = DatasetAttackConfigPerRecordKnnProbabilities()): + synthetic_data: ArrayDataset, + config: DatasetAttackConfigPerRecordKnnProbabilities = DatasetAttackConfigPerRecordKnnProbabilities(), + dataset_name: str = DEFAULT_DATASET_NAME): """ :param original_data_members: A container for the training original samples and labels :param original_data_non_members: A container for the holdout original samples and labels :param synthetic_data: A container for the synthetic samples and labels - :param dataset_name: A name to identify this dataset :param config: Configuration parameters to guide the attack, optional + :param dataset_name: A name to identify this dataset, optional """ attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size) - super().__init__(original_data_members, original_data_non_members, synthetic_data, dataset_name, - attack_strategy_utils, config) + super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name, + attack_strategy_utils) if 
config.compute_distance: self.knn_learner = NearestNeighbors(n_neighbors=config.k, algorithm='auto', metric=config.compute_distance, metric_params=config.distance_params) @@ -91,15 +91,15 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord): The assumption is that since the generative model is trained to approximate the training data distribution then the probability of a sample to be a member of the training data should be proportional to the probability that the query sample can be generated by the generative model. - The assumption is that if the probability that the query sample is generated by the generative model is large, + So, if the probability that the query sample is generated by the generative model is large, it is more likely that the query sample was used to train the generative model. This probability is approximated by the Parzen window density estimation in 'probability_per_sample()', computed from the NN distances from the query samples to the synthetic data samples. 
- :return - :score Privacy score of the attack together with the attack result with the probabilities of positive and - negative samples to be generated by the synthetic data generator based on the NN distances from the - query samples to the synthetic data samples + :return: + Privacy score of the attack together with the attack result with the probabilities of positive and + negative samples to be generated by the synthetic data generator based on the NN distances from the + query samples to the synthetic data samples """ # nearest neighbor search self.attack_strategy_utils.fit(self.knn_learner, self.synthetic_data) @@ -112,7 +112,7 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord): neg_proba = self.attack_strategy_utils.find_knn(self.knn_learner, self.original_data_non_members, self.probability_per_sample) - result = DatasetAttackResultPerRecord(self.dataset_name, positive_probabilities=pos_proba, + result = DatasetAttackResultPerRecord(positive_probabilities=pos_proba, negative_probabilities=neg_proba) score = self.calculate_privacy_score(result, self.config.generate_plot) @@ -126,8 +126,8 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord): :param dataset_attack_result attack result containing probabilities of positive and negative samples to be generated by the synthetic data generator :param generate_plot generate AUC ROC curve plot and persist it - :return - :score of the attack, based on distance-based probabilities - mainly the ROC AUC score + :return: + score of the attack, based on distance-based probabilities - mainly the ROC AUC score """ pos_proba, neg_proba = \ dataset_attack_result.positive_probabilities, dataset_attack_result.negative_probabilities @@ -145,7 +145,7 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord): For every sample represented by its distance from the query sample to its KNN in synthetic data, computes the probability of the synthetic data to be part of the query 
dataset. :param distances: distance between every query sample in batch to its KNNs among synthetic samples - :return - distances: probability estimates of the query samples being generated and so being part of the synthetic set + :return: + probability estimates of the query samples being generated and hence of being part of the synthetic set """ return np.average(np.exp(-distances), axis=1) diff --git a/apt/risk/data_assessment/dataset_attack_result.py b/apt/risk/data_assessment/dataset_attack_result.py index 530709b..98b73d3 100644 --- a/apt/risk/data_assessment/dataset_attack_result.py +++ b/apt/risk/data_assessment/dataset_attack_result.py @@ -3,6 +3,8 @@ from dataclasses import dataclass, field import numpy as np +DEFAULT_DATASET_NAME = "dataset" + @dataclass class DatasetAttackScore: dataset_name: str @@ -10,10 +12,10 @@ class DatasetAttackScore: @dataclass class DatasetAttackResult: - dataset_name: str + pass -@dataclass +@dataclass(repr=False) class DatasetAttackScoreWithResult(DatasetAttackScore): result: DatasetAttackResult = field(repr=False) diff --git a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py index f0d52fd..e6ec16f 100644 --- a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py +++ b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py @@ -6,14 +6,13 @@ and "Holdout-Based Fidelity and Privacy Assessment of Mixed-Type Synthetic Data" and on a variation of its reference implementation in https://github.com/mostly-ai/paper-fidelity-accuracy. 
""" from dataclasses import dataclass -from typing import Optional import numpy as np from sklearn.neighbors import NearestNeighbors from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils from apt.risk.data_assessment.dataset_attack import Config, DatasetAttack -from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore +from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DEFAULT_DATASET_NAME from apt.utils.datasets import ArrayDataset K = 1 # Number of nearest neighbors to search. For DCR we need only the nearest neighbor. @@ -40,7 +39,7 @@ class DatasetAttackConfigWholeDatasetKnnDistance(Config): @dataclass class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore): - """Configuration for DatasetAttackWholeDatasetKnnDistance. + """DatasetAttackWholeDatasetKnnDistance privacy score. Attributes ---------- share : the share of synthetic records closer to the training than the holdout dataset. @@ -61,20 +60,19 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack): """ def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, - synthetic_data: ArrayDataset, dataset_name: str, - config: Optional[ - DatasetAttackConfigWholeDatasetKnnDistance] = DatasetAttackConfigWholeDatasetKnnDistance()): + synthetic_data: ArrayDataset, + config: DatasetAttackConfigWholeDatasetKnnDistance = DatasetAttackConfigWholeDatasetKnnDistance(), + dataset_name: str = DEFAULT_DATASET_NAME): """ :param original_data_members: A container for the training original samples and labels :param original_data_non_members: A container for the holdout original samples and labels :param synthetic_data: A container for the synthetic samples and labels - :param dataset_name: A name to identify this dataset - :param config: Configuration parameters to guide the assessment process such as which attack - frameworks to use, optional + :param config: Configuration parameters to guide the 
assessment process, optional + :param dataset_name: A name to identify this dataset, optional """ attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size) - super().__init__(original_data_members, original_data_non_members, synthetic_data, dataset_name, - attack_strategy_utils, config) + super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name, + attack_strategy_utils) if config.compute_distance: self.knn_learner_members = NearestNeighbors(n_neighbors=K, metric=config.compute_distance, metric_params=config.distance_params) @@ -89,7 +87,7 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack): Calculate the share of synthetic records closer to the training than the holdout dataset, based on the DCR computed by 'calculate_distances()'. :return: - :score of the attack, based on the NN distances from the query samples to the synthetic data samples + score of the attack, based on the NN distances from the query samples to the synthetic data samples """ member_distances, non_member_distances = self.calculate_distances() # distance of the synth. records to members and to non-members @@ -111,8 +109,8 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack): N. Park et. al. in "Data Synthesis based on Generative Adversarial Networks." 
:return: - pos_distances: distances of each synthetic data member from its nearest training sample - neg_distances: distances of each synthetic data member from its nearest validation sample + pos_distances - distances of each synthetic data member from its nearest training sample + neg_distances - distances of each synthetic data member from its nearest validation sample """ # nearest neighbor search self.attack_strategy_utils.fit(self.knn_learner_members, self.original_data_members) diff --git a/tests/test_data_assessment.py b/tests/test_data_assessment.py index 902d4bf..5bbe6ba 100644 --- a/tests/test_data_assessment.py +++ b/tests/test_data_assessment.py @@ -60,8 +60,8 @@ def test_risk_anonymization(name, data, dataset_type, k, mgr): original_data_members = ArrayDataset(preprocessed_x_train, y_train) original_data_non_members = ArrayDataset(preprocessed_x_test, y_test) - score_g, score_h = mgr.assess(original_data_members, original_data_non_members, anonymized_data, - f'anon_k{k}_{name}') + [score_g, score_h] = mgr.assess(original_data_members, original_data_non_members, anonymized_data, + f'anon_k{k}_{name}') assert (score_g.roc_auc_score > 0.5) assert (score_g.average_precision_score > 0.5) @@ -96,8 +96,8 @@ def test_risk_kde(name, data, dataset_type, mgr): original_data_members = ArrayDataset(encoded, y_train) original_data_non_members = ArrayDataset(encoded_test, y_test) - score_g, score_h = mgr.assess(original_data_members, original_data_non_members, synth_data, - 'kde' + str(NUM_SYNTH_SAMPLES) + name) + [score_g, score_h] = mgr.assess(original_data_members, original_data_non_members, synth_data, + 'kde' + str(NUM_SYNTH_SAMPLES) + name) assert (score_g.roc_auc_score > 0.5) assert (score_g.average_precision_score > 0.5)