Address review comments - make dataset_name optional, fix filename. Fix score serialization - don't serialize the result.

Signed-off-by: Maya Anderson <mayaa@il.ibm.com>
This commit is contained in:
Maya Anderson 2023-03-09 22:38:39 +02:00
parent 3ae64054f8
commit a122976807
6 changed files with 73 additions and 58 deletions

View file

@ -1,12 +1,15 @@
from dataclasses import dataclass
from __future__ import annotations
from typing import Optional
from dataclasses import dataclass
import pandas as pd
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DEFAULT_DATASET_NAME
from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import \
DatasetAttackConfigWholeDatasetKnnDistance, DatasetAttackWholeDatasetKnnDistance, \
DatasetAttackScoreWholeDatasetKnnDistance
from apt.risk.data_assessment.per_record_knn_probabilities_dataset_attack_ import \
from apt.risk.data_assessment.dataset_attack_per_record_knn_probabilities import \
DatasetAttackConfigPerRecordKnnProbabilities, DatasetAttackPerRecordKnnProbabilities, \
DatasetAttackScorePerRecordKnnProbabilities
from apt.utils.datasets import ArrayDataset
@ -22,8 +25,8 @@ class DatasetAssessmentManager:
"""
The main class for running dataset assessment attacks.
"""
attack_scores_per_record_knn_probabilities = []
attack_scores_whole_dataset_knn_distance = []
attack_scores_per_record_knn_probabilities: list[DatasetAttackScore] = []
attack_scores_whole_dataset_knn_distance: list[DatasetAttackScore] = []
def __init__(self, config: Optional[DatasetAssessmentManagerConfig] = DatasetAssessmentManagerConfig) -> None:
"""
@ -32,27 +35,39 @@ class DatasetAssessmentManager:
self.config = config
def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
synthetic_data: ArrayDataset, dataset_name: str = "dataset") -> (
DatasetAttackScorePerRecordKnnProbabilities, DatasetAttackScoreWholeDatasetKnnDistance):
synthetic_data: ArrayDataset, dataset_name: str = DEFAULT_DATASET_NAME) -> list[DatasetAttackScore]:
"""
Do dataset assessment by running dataset attacks, and return their scores.
:param original_data_members: A container for the training original samples and labels,
only samples are used in the assessment
:param original_data_non_members: A container for the holdout original samples and labels,
only samples are used in the assessment
:param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment
:param dataset_name: A name to identify this dataset, optional
:return:
a list of dataset attack scores
"""
config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False,
generate_plot=self.config.generate_plots)
mgr = DatasetAttackPerRecordKnnProbabilities(original_data_members,
original_data_non_members,
synthetic_data,
dataset_name,
config_gl)
config_gl,
dataset_name)
score_g = mgr.assess_privacy()
self.attack_scores_per_record_knn_probabilities.append(score_g)
config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False)
mgr_h = DatasetAttackWholeDatasetKnnDistance(original_data_members, original_data_non_members, synthetic_data,
dataset_name,
config_h)
config_h,
dataset_name)
score_h = mgr_h.assess_privacy()
self.attack_scores_whole_dataset_knn_distance.append(score_h)
return score_g, score_h
return [score_g, score_h]
def dump_all_scores_to_files(self):
if self.config.persist_reports:
@ -63,6 +78,6 @@ class DatasetAssessmentManager:
"whole_dataset_knn_distance" + results_log_file, True)
@staticmethod
def dump_scores_to_file(attack_scores, filename, header: bool):
run_results_df = pd.DataFrame(attack_scores)
def dump_scores_to_file(attack_scores: list[DatasetAttackScore], filename: str, header: bool):
run_results_df = pd.DataFrame(attack_scores).drop('result', axis=1, errors='ignore') # don't serialize result
run_results_df.to_csv(filename, header=header, encoding='utf-8', index=False, mode='w') # Overwrite

View file

@ -29,26 +29,26 @@ class DatasetAttack(abc.ABC):
"""
def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
synthetic_data: ArrayDataset, dataset_name: str, attack_strategy_utils: AttackStrategyUtils,
config: Optional[Config] = Config()) -> None:
synthetic_data: ArrayDataset, config: Config, dataset_name: str,
attack_strategy_utils: Optional[AttackStrategyUtils] = None) -> None:
"""
:param original_data_members: A container for the training original samples and labels,
only samples are used in the assessment
:param original_data_non_members: A container for the holdout original samples and labels,
only samples are used in the assessment
:param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment
:param dataset_name: A name to identify the dataset under attack
:param attack_strategy_utils: Utils for use with the attack strategy
:param config: Configuration parameters to guide the assessment process such as which attack
frameworks to use, optional
frameworks to use
:param dataset_name: A name to identify the dataset under attack, optional
:param attack_strategy_utils: Utils for use with the attack strategy, optional
"""
self.original_data_members = original_data_members
self.original_data_non_members = original_data_non_members
self.synthetic_data = synthetic_data
self.dataset_name = dataset_name
self.attack_strategy_utils = attack_strategy_utils
self.config = config
self.attack_strategy_utils = attack_strategy_utils
self.dataset_name = dataset_name
@abc.abstractmethod
def assess_privacy(self) -> DatasetAttackScore:

View file

@ -5,7 +5,7 @@ published in Proceedings of the 2020 ACM SIGSAC Conference on Computer and Commu
https://doi.org/10.1145/3372297.3417238 and its implementation in https://github.com/DingfanChen/GAN-Leaks.
"""
from dataclasses import dataclass
from typing import Optional, Callable
from typing import Callable
import numpy as np
from sklearn.neighbors import NearestNeighbors
@ -13,7 +13,7 @@ from sklearn.neighbors import NearestNeighbors
from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils
from apt.risk.data_assessment.dataset_attack import DatasetAttackPerRecord, Config
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord, \
DatasetAttackScoreWithResult
DatasetAttackScoreWithResult, DEFAULT_DATASET_NAME
from apt.utils.datasets import ArrayDataset
@ -65,19 +65,19 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
"""
def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
synthetic_data: ArrayDataset, dataset_name: str,
config: Optional[
DatasetAttackConfigPerRecordKnnProbabilities] = DatasetAttackConfigPerRecordKnnProbabilities()):
synthetic_data: ArrayDataset,
config: DatasetAttackConfigPerRecordKnnProbabilities = DatasetAttackConfigPerRecordKnnProbabilities(),
dataset_name: str = DEFAULT_DATASET_NAME):
"""
:param original_data_members: A container for the training original samples and labels
:param original_data_non_members: A container for the holdout original samples and labels
:param synthetic_data: A container for the synthetic samples and labels
:param dataset_name: A name to identify this dataset
:param config: Configuration parameters to guide the attack, optional
:param dataset_name: A name to identify this dataset, optional
"""
attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size)
super().__init__(original_data_members, original_data_non_members, synthetic_data, dataset_name,
attack_strategy_utils, config)
super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name,
attack_strategy_utils)
if config.compute_distance:
self.knn_learner = NearestNeighbors(n_neighbors=config.k, algorithm='auto', metric=config.compute_distance,
metric_params=config.distance_params)
@ -91,15 +91,15 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
The assumption is that since the generative model is trained to approximate the training data distribution
then the probability of a sample to be a member of the training data should be proportional to the probability
that the query sample can be generated by the generative model.
The assumption is that if the probability that the query sample is generated by the generative model is large,
So, if the probability that the query sample is generated by the generative model is large,
it is more likely that the query sample was used to train the generative model. This probability is approximated
by the Parzen window density estimation in 'probability_per_sample()', computed from the NN distances from the
query samples to the synthetic data samples.
:return
:score Privacy score of the attack together with the attack result with the probabilities of positive and
negative samples to be generated by the synthetic data generator based on the NN distances from the
query samples to the synthetic data samples
:return:
Privacy score of the attack together with the attack result with the probabilities of positive and
negative samples to be generated by the synthetic data generator based on the NN distances from the
query samples to the synthetic data samples
"""
# nearest neighbor search
self.attack_strategy_utils.fit(self.knn_learner, self.synthetic_data)
@ -112,7 +112,7 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
neg_proba = self.attack_strategy_utils.find_knn(self.knn_learner, self.original_data_non_members,
self.probability_per_sample)
result = DatasetAttackResultPerRecord(self.dataset_name, positive_probabilities=pos_proba,
result = DatasetAttackResultPerRecord(positive_probabilities=pos_proba,
negative_probabilities=neg_proba)
score = self.calculate_privacy_score(result, self.config.generate_plot)
@ -126,8 +126,8 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
:param dataset_attack_result: attack result containing probabilities of positive and negative samples to be
generated by the synthetic data generator
:param generate_plot: generate AUC ROC curve plot and persist it
:return
:score of the attack, based on distance-based probabilities - mainly the ROC AUC score
:return:
score of the attack, based on distance-based probabilities - mainly the ROC AUC score
"""
pos_proba, neg_proba = \
dataset_attack_result.positive_probabilities, dataset_attack_result.negative_probabilities
@ -145,7 +145,7 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
For every sample represented by its distance from the query sample to its KNN in synthetic data,
computes the probability of the synthetic data to be part of the query dataset.
:param distances: distance between every query sample in batch to its KNNs among synthetic samples
:return
distances: probability estimates of the query samples being generated and so being part of the synthetic set
:return:
probability estimates of the query samples having been generated, and therefore of being part of the synthetic set
"""
return np.average(np.exp(-distances), axis=1)

View file

@ -3,6 +3,8 @@ from dataclasses import dataclass, field
import numpy as np
DEFAULT_DATASET_NAME = "dataset"
@dataclass
class DatasetAttackScore:
dataset_name: str
@ -10,10 +12,10 @@ class DatasetAttackScore:
@dataclass
class DatasetAttackResult:
dataset_name: str
pass
@dataclass
@dataclass(repr=False)
class DatasetAttackScoreWithResult(DatasetAttackScore):
result: DatasetAttackResult = field(repr=False)

View file

@ -6,14 +6,13 @@ and "Holdout-Based Fidelity and Privacy Assessment of Mixed-Type Synthetic Data"
and on a variation of its reference implementation in https://github.com/mostly-ai/paper-fidelity-accuracy.
"""
from dataclasses import dataclass
from typing import Optional
import numpy as np
from sklearn.neighbors import NearestNeighbors
from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils
from apt.risk.data_assessment.dataset_attack import Config, DatasetAttack
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DEFAULT_DATASET_NAME
from apt.utils.datasets import ArrayDataset
K = 1 # Number of nearest neighbors to search. For DCR we need only the nearest neighbor.
@ -40,7 +39,7 @@ class DatasetAttackConfigWholeDatasetKnnDistance(Config):
@dataclass
class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore):
"""Configuration for DatasetAttackWholeDatasetKnnDistance.
"""DatasetAttackWholeDatasetKnnDistance privacy score.
Attributes
----------
share : the share of synthetic records closer to the training than the holdout dataset.
@ -61,20 +60,19 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
"""
def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
synthetic_data: ArrayDataset, dataset_name: str,
config: Optional[
DatasetAttackConfigWholeDatasetKnnDistance] = DatasetAttackConfigWholeDatasetKnnDistance()):
synthetic_data: ArrayDataset,
config: DatasetAttackConfigWholeDatasetKnnDistance = DatasetAttackConfigWholeDatasetKnnDistance(),
dataset_name: str = DEFAULT_DATASET_NAME):
"""
:param original_data_members: A container for the training original samples and labels
:param original_data_non_members: A container for the holdout original samples and labels
:param synthetic_data: A container for the synthetic samples and labels
:param dataset_name: A name to identify this dataset
:param config: Configuration parameters to guide the assessment process such as which attack
frameworks to use, optional
:param config: Configuration parameters to guide the assessment process, optional
:param dataset_name: A name to identify this dataset, optional
"""
attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size)
super().__init__(original_data_members, original_data_non_members, synthetic_data, dataset_name,
attack_strategy_utils, config)
super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name,
attack_strategy_utils)
if config.compute_distance:
self.knn_learner_members = NearestNeighbors(n_neighbors=K, metric=config.compute_distance,
metric_params=config.distance_params)
@ -89,7 +87,7 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
Calculate the share of synthetic records closer to the training than the holdout dataset, based on the
DCR computed by 'calculate_distances()'.
:return:
:score of the attack, based on the NN distances from the query samples to the synthetic data samples
score of the attack, based on the NN distances from the query samples to the synthetic data samples
"""
member_distances, non_member_distances = self.calculate_distances()
# distance of the synth. records to members and to non-members
@ -111,8 +109,8 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
N. Park et al. in "Data Synthesis based on Generative Adversarial Networks."
:return:
pos_distances: distances of each synthetic data member from its nearest training sample
neg_distances: distances of each synthetic data member from its nearest validation sample
pos_distances - distances of each synthetic data member from its nearest training sample
neg_distances - distances of each synthetic data member from its nearest validation sample
"""
# nearest neighbor search
self.attack_strategy_utils.fit(self.knn_learner_members, self.original_data_members)

View file

@ -60,8 +60,8 @@ def test_risk_anonymization(name, data, dataset_type, k, mgr):
original_data_members = ArrayDataset(preprocessed_x_train, y_train)
original_data_non_members = ArrayDataset(preprocessed_x_test, y_test)
score_g, score_h = mgr.assess(original_data_members, original_data_non_members, anonymized_data,
f'anon_k{k}_{name}')
[score_g, score_h] = mgr.assess(original_data_members, original_data_non_members, anonymized_data,
f'anon_k{k}_{name}')
assert (score_g.roc_auc_score > 0.5)
assert (score_g.average_precision_score > 0.5)
@ -96,8 +96,8 @@ def test_risk_kde(name, data, dataset_type, mgr):
original_data_members = ArrayDataset(encoded, y_train)
original_data_non_members = ArrayDataset(encoded_test, y_test)
score_g, score_h = mgr.assess(original_data_members, original_data_non_members, synth_data,
'kde' + str(NUM_SYNTH_SAMPLES) + name)
[score_g, score_h] = mgr.assess(original_data_members, original_data_non_members, synth_data,
'kde' + str(NUM_SYNTH_SAMPLES) + name)
assert (score_g.roc_auc_score > 0.5)
assert (score_g.average_precision_score > 0.5)