mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-06-08 15:05:13 +02:00
Address review comments - make dataset_name optional, fix filename. Fix score serialization - don't serialize the result.
Signed-off-by: Maya Anderson <mayaa@il.ibm.com>
This commit is contained in:
parent
3ae64054f8
commit
a122976807
6 changed files with 73 additions and 58 deletions
|
|
@ -1,12 +1,15 @@
|
|||
from dataclasses import dataclass
|
||||
from __future__ import annotations
|
||||
from typing import Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DEFAULT_DATASET_NAME
|
||||
from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import \
|
||||
DatasetAttackConfigWholeDatasetKnnDistance, DatasetAttackWholeDatasetKnnDistance, \
|
||||
DatasetAttackScoreWholeDatasetKnnDistance
|
||||
from apt.risk.data_assessment.per_record_knn_probabilities_dataset_attack_ import \
|
||||
from apt.risk.data_assessment.dataset_attack_per_record_knn_probabilities import \
|
||||
DatasetAttackConfigPerRecordKnnProbabilities, DatasetAttackPerRecordKnnProbabilities, \
|
||||
DatasetAttackScorePerRecordKnnProbabilities
|
||||
from apt.utils.datasets import ArrayDataset
|
||||
|
|
@ -22,8 +25,8 @@ class DatasetAssessmentManager:
|
|||
"""
|
||||
The main class for running dataset assessment attacks.
|
||||
"""
|
||||
attack_scores_per_record_knn_probabilities = []
|
||||
attack_scores_whole_dataset_knn_distance = []
|
||||
attack_scores_per_record_knn_probabilities: list[DatasetAttackScore] = []
|
||||
attack_scores_whole_dataset_knn_distance: list[DatasetAttackScore] = []
|
||||
|
||||
def __init__(self, config: Optional[DatasetAssessmentManagerConfig] = DatasetAssessmentManagerConfig) -> None:
|
||||
"""
|
||||
|
|
@ -32,27 +35,39 @@ class DatasetAssessmentManager:
|
|||
self.config = config
|
||||
|
||||
def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
|
||||
synthetic_data: ArrayDataset, dataset_name: str = "dataset") -> (
|
||||
DatasetAttackScorePerRecordKnnProbabilities, DatasetAttackScoreWholeDatasetKnnDistance):
|
||||
synthetic_data: ArrayDataset, dataset_name: str = DEFAULT_DATASET_NAME) -> list[DatasetAttackScore]:
|
||||
"""
|
||||
Do dataset assessment by running dataset attacks, and return their scores.
|
||||
|
||||
:param original_data_members: A container for the training original samples and labels,
|
||||
only samples are used in the assessment
|
||||
:param original_data_non_members: A container for the holdout original samples and labels,
|
||||
only samples are used in the assessment
|
||||
:param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment
|
||||
:param dataset_name: A name to identify this dataset, optional
|
||||
|
||||
:return:
|
||||
a list of dataset attack scores
|
||||
"""
|
||||
config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False,
|
||||
generate_plot=self.config.generate_plots)
|
||||
mgr = DatasetAttackPerRecordKnnProbabilities(original_data_members,
|
||||
original_data_non_members,
|
||||
synthetic_data,
|
||||
dataset_name,
|
||||
config_gl)
|
||||
config_gl,
|
||||
dataset_name)
|
||||
|
||||
score_g = mgr.assess_privacy()
|
||||
self.attack_scores_per_record_knn_probabilities.append(score_g)
|
||||
|
||||
config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False)
|
||||
mgr_h = DatasetAttackWholeDatasetKnnDistance(original_data_members, original_data_non_members, synthetic_data,
|
||||
dataset_name,
|
||||
config_h)
|
||||
config_h,
|
||||
dataset_name)
|
||||
|
||||
score_h = mgr_h.assess_privacy()
|
||||
self.attack_scores_whole_dataset_knn_distance.append(score_h)
|
||||
return score_g, score_h
|
||||
return [score_g, score_h]
|
||||
|
||||
def dump_all_scores_to_files(self):
|
||||
if self.config.persist_reports:
|
||||
|
|
@ -63,6 +78,6 @@ class DatasetAssessmentManager:
|
|||
"whole_dataset_knn_distance" + results_log_file, True)
|
||||
|
||||
@staticmethod
|
||||
def dump_scores_to_file(attack_scores, filename, header: bool):
|
||||
run_results_df = pd.DataFrame(attack_scores)
|
||||
def dump_scores_to_file(attack_scores: list[DatasetAttackScore], filename: str, header: bool):
|
||||
run_results_df = pd.DataFrame(attack_scores).drop('result', axis=1, errors='ignore') # don't serialize result
|
||||
run_results_df.to_csv(filename, header=header, encoding='utf-8', index=False, mode='w') # Overwrite
|
||||
|
|
|
|||
|
|
@ -29,26 +29,26 @@ class DatasetAttack(abc.ABC):
|
|||
"""
|
||||
|
||||
def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
|
||||
synthetic_data: ArrayDataset, dataset_name: str, attack_strategy_utils: AttackStrategyUtils,
|
||||
config: Optional[Config] = Config()) -> None:
|
||||
synthetic_data: ArrayDataset, config: Config, dataset_name: str,
|
||||
attack_strategy_utils: Optional[AttackStrategyUtils] = None) -> None:
|
||||
"""
|
||||
:param original_data_members: A container for the training original samples and labels,
|
||||
only samples are used in the assessment
|
||||
:param original_data_non_members: A container for the holdout original samples and labels,
|
||||
only samples are used in the assessment
|
||||
:param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment
|
||||
:param dataset_name: A name to identify the dataset under attack
|
||||
:param attack_strategy_utils: Utils for use with the attack strategy
|
||||
:param config: Configuration parameters to guide the assessment process such as which attack
|
||||
frameworks to use, optional
|
||||
frameworks to use
|
||||
:param dataset_name: A name to identify the dataset under attack, optional
|
||||
:param attack_strategy_utils: Utils for use with the attack strategy, optional
|
||||
"""
|
||||
|
||||
self.original_data_members = original_data_members
|
||||
self.original_data_non_members = original_data_non_members
|
||||
self.synthetic_data = synthetic_data
|
||||
self.dataset_name = dataset_name
|
||||
self.attack_strategy_utils = attack_strategy_utils
|
||||
self.config = config
|
||||
self.attack_strategy_utils = attack_strategy_utils
|
||||
self.dataset_name = dataset_name
|
||||
|
||||
@abc.abstractmethod
|
||||
def assess_privacy(self) -> DatasetAttackScore:
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ published in Proceedings of the 2020 ACM SIGSAC Conference on Computer and Commu
|
|||
https://doi.org/10.1145/3372297.3417238 and its implementation in https://github.com/DingfanChen/GAN-Leaks.
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Callable
|
||||
from typing import Callable
|
||||
|
||||
import numpy as np
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
|
|
@ -13,7 +13,7 @@ from sklearn.neighbors import NearestNeighbors
|
|||
from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils
|
||||
from apt.risk.data_assessment.dataset_attack import DatasetAttackPerRecord, Config
|
||||
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord, \
|
||||
DatasetAttackScoreWithResult
|
||||
DatasetAttackScoreWithResult, DEFAULT_DATASET_NAME
|
||||
from apt.utils.datasets import ArrayDataset
|
||||
|
||||
|
||||
|
|
@ -65,19 +65,19 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
|
|||
"""
|
||||
|
||||
def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
|
||||
synthetic_data: ArrayDataset, dataset_name: str,
|
||||
config: Optional[
|
||||
DatasetAttackConfigPerRecordKnnProbabilities] = DatasetAttackConfigPerRecordKnnProbabilities()):
|
||||
synthetic_data: ArrayDataset,
|
||||
config: DatasetAttackConfigPerRecordKnnProbabilities = DatasetAttackConfigPerRecordKnnProbabilities(),
|
||||
dataset_name: str = DEFAULT_DATASET_NAME):
|
||||
"""
|
||||
:param original_data_members: A container for the training original samples and labels
|
||||
:param original_data_non_members: A container for the holdout original samples and labels
|
||||
:param synthetic_data: A container for the synthetic samples and labels
|
||||
:param dataset_name: A name to identify this dataset
|
||||
:param config: Configuration parameters to guide the attack, optional
|
||||
:param dataset_name: A name to identify this dataset, optional
|
||||
"""
|
||||
attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size)
|
||||
super().__init__(original_data_members, original_data_non_members, synthetic_data, dataset_name,
|
||||
attack_strategy_utils, config)
|
||||
super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name,
|
||||
attack_strategy_utils)
|
||||
if config.compute_distance:
|
||||
self.knn_learner = NearestNeighbors(n_neighbors=config.k, algorithm='auto', metric=config.compute_distance,
|
||||
metric_params=config.distance_params)
|
||||
|
|
@ -91,15 +91,15 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
|
|||
The assumption is that since the generative model is trained to approximate the training data distribution
|
||||
then the probability of a sample to be a member of the training data should be proportional to the probability
|
||||
that the query sample can be generated by the generative model.
|
||||
The assumption is that if the probability that the query sample is generated by the generative model is large,
|
||||
So, if the probability that the query sample is generated by the generative model is large,
|
||||
it is more likely that the query sample was used to train the generative model. This probability is approximated
|
||||
by the Parzen window density estimation in 'probability_per_sample()', computed from the NN distances from the
|
||||
query samples to the synthetic data samples.
|
||||
|
||||
:return
|
||||
:score Privacy score of the attack together with the attack result with the probabilities of positive and
|
||||
negative samples to be generated by the synthetic data generator based on the NN distances from the
|
||||
query samples to the synthetic data samples
|
||||
:return:
|
||||
Privacy score of the attack together with the attack result with the probabilities of positive and
|
||||
negative samples to be generated by the synthetic data generator based on the NN distances from the
|
||||
query samples to the synthetic data samples
|
||||
"""
|
||||
# nearest neighbor search
|
||||
self.attack_strategy_utils.fit(self.knn_learner, self.synthetic_data)
|
||||
|
|
@ -112,7 +112,7 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
|
|||
neg_proba = self.attack_strategy_utils.find_knn(self.knn_learner, self.original_data_non_members,
|
||||
self.probability_per_sample)
|
||||
|
||||
result = DatasetAttackResultPerRecord(self.dataset_name, positive_probabilities=pos_proba,
|
||||
result = DatasetAttackResultPerRecord(positive_probabilities=pos_proba,
|
||||
negative_probabilities=neg_proba)
|
||||
|
||||
score = self.calculate_privacy_score(result, self.config.generate_plot)
|
||||
|
|
@ -126,8 +126,8 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
|
|||
:param dataset_attack_result attack result containing probabilities of positive and negative samples to be
|
||||
generated by the synthetic data generator
|
||||
:param generate_plot generate AUC ROC curve plot and persist it
|
||||
:return
|
||||
:score of the attack, based on distance-based probabilities - mainly the ROC AUC score
|
||||
:return:
|
||||
score of the attack, based on distance-based probabilities - mainly the ROC AUC score
|
||||
"""
|
||||
pos_proba, neg_proba = \
|
||||
dataset_attack_result.positive_probabilities, dataset_attack_result.negative_probabilities
|
||||
|
|
@ -145,7 +145,7 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
|
|||
For every sample represented by its distance from the query sample to its KNN in synthetic data,
|
||||
computes the probability of the synthetic data to be part of the query dataset.
|
||||
:param distances: distance between every query sample in batch to its KNNs among synthetic samples
|
||||
:return
|
||||
distances: probability estimates of the query samples being generated and so being part of the synthetic set
|
||||
:return:
|
||||
probability estimates of the query samples being generated, and therefore of being part of the synthetic set
|
||||
"""
|
||||
return np.average(np.exp(-distances), axis=1)
|
||||
|
|
@ -3,6 +3,8 @@ from dataclasses import dataclass, field
|
|||
import numpy as np
|
||||
|
||||
|
||||
DEFAULT_DATASET_NAME = "dataset"
|
||||
|
||||
@dataclass
|
||||
class DatasetAttackScore:
|
||||
dataset_name: str
|
||||
|
|
@ -10,10 +12,10 @@ class DatasetAttackScore:
|
|||
|
||||
@dataclass
|
||||
class DatasetAttackResult:
|
||||
dataset_name: str
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(repr=False)
|
||||
class DatasetAttackScoreWithResult(DatasetAttackScore):
|
||||
result: DatasetAttackResult = field(repr=False)
|
||||
|
||||
|
|
|
|||
|
|
@ -6,14 +6,13 @@ and "Holdout-Based Fidelity and Privacy Assessment of Mixed-Type Synthetic Data"
|
|||
and on a variation of its reference implementation in https://github.com/mostly-ai/paper-fidelity-accuracy.
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
|
||||
from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils
|
||||
from apt.risk.data_assessment.dataset_attack import Config, DatasetAttack
|
||||
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore
|
||||
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DEFAULT_DATASET_NAME
|
||||
from apt.utils.datasets import ArrayDataset
|
||||
|
||||
K = 1 # Number of nearest neighbors to search. For DCR we need only the nearest neighbor.
|
||||
|
|
@ -40,7 +39,7 @@ class DatasetAttackConfigWholeDatasetKnnDistance(Config):
|
|||
|
||||
@dataclass
|
||||
class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore):
|
||||
"""Configuration for DatasetAttackWholeDatasetKnnDistance.
|
||||
"""DatasetAttackWholeDatasetKnnDistance privacy score.
|
||||
Attributes
|
||||
----------
|
||||
share : the share of synthetic records closer to the training than the holdout dataset.
|
||||
|
|
@ -61,20 +60,19 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
|
|||
"""
|
||||
|
||||
def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
|
||||
synthetic_data: ArrayDataset, dataset_name: str,
|
||||
config: Optional[
|
||||
DatasetAttackConfigWholeDatasetKnnDistance] = DatasetAttackConfigWholeDatasetKnnDistance()):
|
||||
synthetic_data: ArrayDataset,
|
||||
config: DatasetAttackConfigWholeDatasetKnnDistance = DatasetAttackConfigWholeDatasetKnnDistance(),
|
||||
dataset_name: str = DEFAULT_DATASET_NAME):
|
||||
"""
|
||||
:param original_data_members: A container for the training original samples and labels
|
||||
:param original_data_non_members: A container for the holdout original samples and labels
|
||||
:param synthetic_data: A container for the synthetic samples and labels
|
||||
:param dataset_name: A name to identify this dataset
|
||||
:param config: Configuration parameters to guide the assessment process such as which attack
|
||||
frameworks to use, optional
|
||||
:param config: Configuration parameters to guide the assessment process, optional
|
||||
:param dataset_name: A name to identify this dataset, optional
|
||||
"""
|
||||
attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size)
|
||||
super().__init__(original_data_members, original_data_non_members, synthetic_data, dataset_name,
|
||||
attack_strategy_utils, config)
|
||||
super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name,
|
||||
attack_strategy_utils)
|
||||
if config.compute_distance:
|
||||
self.knn_learner_members = NearestNeighbors(n_neighbors=K, metric=config.compute_distance,
|
||||
metric_params=config.distance_params)
|
||||
|
|
@ -89,7 +87,7 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
|
|||
Calculate the share of synthetic records closer to the training than the holdout dataset, based on the
|
||||
DCR computed by 'calculate_distances()'.
|
||||
:return:
|
||||
:score of the attack, based on the NN distances from the query samples to the synthetic data samples
|
||||
score of the attack, based on the NN distances from the query samples to the synthetic data samples
|
||||
"""
|
||||
member_distances, non_member_distances = self.calculate_distances()
|
||||
# distance of the synth. records to members and to non-members
|
||||
|
|
@ -111,8 +109,8 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
|
|||
N. Park et al. in "Data Synthesis based on Generative Adversarial Networks."
|
||||
|
||||
:return:
|
||||
pos_distances: distances of each synthetic data member from its nearest training sample
|
||||
neg_distances: distances of each synthetic data member from its nearest validation sample
|
||||
pos_distances - distances of each synthetic data member from its nearest training sample
|
||||
neg_distances - distances of each synthetic data member from its nearest validation sample
|
||||
"""
|
||||
# nearest neighbor search
|
||||
self.attack_strategy_utils.fit(self.knn_learner_members, self.original_data_members)
|
||||
|
|
|
|||
|
|
@ -60,8 +60,8 @@ def test_risk_anonymization(name, data, dataset_type, k, mgr):
|
|||
original_data_members = ArrayDataset(preprocessed_x_train, y_train)
|
||||
original_data_non_members = ArrayDataset(preprocessed_x_test, y_test)
|
||||
|
||||
score_g, score_h = mgr.assess(original_data_members, original_data_non_members, anonymized_data,
|
||||
f'anon_k{k}_{name}')
|
||||
[score_g, score_h] = mgr.assess(original_data_members, original_data_non_members, anonymized_data,
|
||||
f'anon_k{k}_{name}')
|
||||
assert (score_g.roc_auc_score > 0.5)
|
||||
assert (score_g.average_precision_score > 0.5)
|
||||
|
||||
|
|
@ -96,8 +96,8 @@ def test_risk_kde(name, data, dataset_type, mgr):
|
|||
original_data_members = ArrayDataset(encoded, y_train)
|
||||
original_data_non_members = ArrayDataset(encoded_test, y_test)
|
||||
|
||||
score_g, score_h = mgr.assess(original_data_members, original_data_non_members, synth_data,
|
||||
'kde' + str(NUM_SYNTH_SAMPLES) + name)
|
||||
[score_g, score_h] = mgr.assess(original_data_members, original_data_non_members, synth_data,
|
||||
'kde' + str(NUM_SYNTH_SAMPLES) + name)
|
||||
|
||||
assert (score_g.roc_auc_score > 0.5)
|
||||
assert (score_g.average_precision_score > 0.5)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue