Rename gan_leaks=>per_record_knn_probabilities and holdout=>whole_dataset_knn_distance

Signed-off-by: Maya Anderson <mayaa@il.ibm.com>
This commit is contained in:
Maya Anderson 2023-03-07 22:54:04 +02:00
parent e7e725ea80
commit e5f6089b23
3 changed files with 46 additions and 41 deletions

View file

@ -3,10 +3,10 @@ from typing import Optional
import pandas as pd
from apt.risk.data_assessment.dataset_attack_gan_leaks import DatasetAttackGanLeaksConfig, DatasetAttackGanLeaks, \
DatasetAttackScoreGanLeaks
from apt.risk.data_assessment.dataset_attack_holdout import DatasetAttackHoldoutConfig, DatasetAttackHoldout, \
DatasetAttackScoreHoldout
from apt.risk.data_assessment.per_record_knn_probabilities_dataset_attack_ import DatasetAttackConfigPerRecordKnnProbabilities, \
DatasetAttackPerRecordKnnProbabilities, DatasetAttackScorePerRecordKnnProbabilities
from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import DatasetAttackConfigWholeDatasetKnnDistance, \
DatasetAttackWholeDatasetKnnDistance, DatasetAttackScoreWholeDatasetKnnDistance
from apt.utils.datasets import ArrayDataset
@ -20,8 +20,8 @@ class DatasetAssessmentManager:
"""
The main class for running dataset assessment attacks.
"""
gan_leaks_attack_scores = []
holdout_attack_scores = []
attack_scores_per_record_knn_probabilities = []
attack_scores_whole_dataset_knn_distance = []
def __init__(self, config: Optional[DatasetAssessmentManagerConfig] = DatasetAssessmentManagerConfig) -> None:
"""
@ -31,32 +31,34 @@ class DatasetAssessmentManager:
def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
synthetic_data: ArrayDataset, dataset_name: str = "dataset") -> (
DatasetAttackScoreGanLeaks, DatasetAttackScoreHoldout):
config_gl = DatasetAttackGanLeaksConfig(use_batches=False, k=5)
mgr = DatasetAttackGanLeaks(original_data_members,
original_data_non_members,
synthetic_data,
dataset_name,
config_gl)
DatasetAttackScorePerRecordKnnProbabilities, DatasetAttackScoreWholeDatasetKnnDistance):
config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False, k=5)
mgr = DatasetAttackPerRecordKnnProbabilities(original_data_members,
original_data_non_members,
synthetic_data,
dataset_name,
config_gl)
result = mgr.assess_privacy()
score_g = mgr.calculate_privacy_score(result, generate_plot=self.config.generate_plots)
self.gan_leaks_attack_scores.append(score_g)
self.attack_scores_per_record_knn_probabilities.append(score_g)
config_h = DatasetAttackHoldoutConfig(use_batches=False, k=5)
mgr_h = DatasetAttackHoldout(original_data_members, original_data_non_members, synthetic_data,
dataset_name,
config_h)
config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False, k=5)
mgr_h = DatasetAttackWholeDatasetKnnDistance(original_data_members, original_data_non_members, synthetic_data,
dataset_name,
config_h)
score_h = mgr_h.assess_privacy()
self.holdout_attack_scores.append(score_h)
self.attack_scores_whole_dataset_knn_distance.append(score_h)
return score_g, score_h
def dump_all_scores_to_files(self):
if self.config.persist_reports:
results_log_file = "_results.log.csv"
self.dump_scores_to_file(self.gan_leaks_attack_scores, "gan_leaks" + results_log_file, True)
self.dump_scores_to_file(self.holdout_attack_scores, "holdout" + results_log_file, True)
self.dump_scores_to_file(self.attack_scores_per_record_knn_probabilities,
"per_record_knn_probabilities" + results_log_file, True)
self.dump_scores_to_file(self.attack_scores_whole_dataset_knn_distance,
"whole_dataset_knn_distance" + results_log_file, True)
@staticmethod
def dump_scores_to_file(attack_scores, filename, header: bool):

View file

@ -16,8 +16,8 @@ from apt.utils.datasets import ArrayDataset
@dataclass
class DatasetAttackHoldoutConfig(Config):
"""Configuration for DatasetAttackHoldout.
class DatasetAttackConfigWholeDatasetKnnDistance(Config):
"""Configuration for DatasetAttackWholeDatasetKnnDistance.
Attributes:
k: Number of nearest neighbors to search
@ -37,18 +37,18 @@ class DatasetAttackHoldoutConfig(Config):
@dataclass
class DatasetAttackScoreHoldout(DatasetAttackScore):
"""Configuration for DatasetAttackHoldout.
class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore):
"""DatasetAttackWholeDatasetKnnDistance privacy score.
Attributes
----------
share : the share of synthetic records closer to the training than the holdout dataset
assessment_type : assessment type is 'Holdout', to be used in reports
assessment_type : assessment type is 'WholeDatasetKnnDistance', to be used in reports
"""
share: float
assessment_type: str = 'Holdout'
assessment_type: str = 'WholeDatasetKnnDistance'
class DatasetAttackHoldout(DatasetAttackWhole):
class DatasetAttackWholeDatasetKnnDistance(DatasetAttackWhole):
"""
Privacy risk assessment for synthetic datasets based on distances of synthetic data records from
members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic
@ -57,7 +57,7 @@ class DatasetAttackHoldout(DatasetAttackWhole):
def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
synthetic_data: ArrayDataset, dataset_name: str,
config: Optional[DatasetAttackHoldoutConfig] = DatasetAttackHoldoutConfig()):
config: Optional[DatasetAttackConfigWholeDatasetKnnDistance] = DatasetAttackConfigWholeDatasetKnnDistance()):
"""
:param original_data_members: A container for the training original samples and labels
:param original_data_non_members: A container for the holdout original samples and labels
@ -80,7 +80,7 @@ class DatasetAttackHoldout(DatasetAttackWhole):
self.nn_obj_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto')
self.nn_obj_non_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto')
def assess_privacy(self) -> DatasetAttackScoreHoldout:
def assess_privacy(self) -> DatasetAttackScoreWholeDatasetKnnDistance:
"""
Calculate the share of synthetic records closer to the training than the holdout dataset
:return:
@ -94,7 +94,7 @@ class DatasetAttackHoldout(DatasetAttackWhole):
# and half of those whose distances to members and to non-members are equal
share = np.mean(member_distances < non_member_distances) + 0.5 * np.mean(
member_distances == non_member_distances)
score = DatasetAttackScoreHoldout(self.dataset_name, share=share)
score = DatasetAttackScoreWholeDatasetKnnDistance(self.dataset_name, share=share)
return score
def calculate_distances(self):

View file

@ -17,8 +17,8 @@ from apt.utils.datasets import ArrayDataset
@dataclass
class DatasetAttackGanLeaksConfig(Config):
"""Configuration for DatasetAttackGanLeaks.
class DatasetAttackConfigPerRecordKnnProbabilities(Config):
"""Configuration for DatasetAttackPerRecordKnnProbabilities.
Attributes:
k: Number of nearest neighbors to search
@ -38,20 +38,21 @@ class DatasetAttackGanLeaksConfig(Config):
@dataclass
class DatasetAttackScoreGanLeaks(DatasetAttackScore):
"""DatasetAttackGanLeaks privacy score.
class DatasetAttackScorePerRecordKnnProbabilities(DatasetAttackScore):
"""DatasetAttackPerRecordKnnProbabilities privacy score.
Attributes
----------
roc_auc_score : the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack performance.
roc_auc_score : the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack
performance.
average_precision_score: the proportion of Predicted Positive cases that are correctly Real Positives (members)
assessment_type : assessment type is 'GANLeaks', to be used in reports
assessment_type : assessment type is 'PerRecordKnnProbabilities', to be used in reports
"""
roc_auc_score: float
average_precision_score: float
assessment_type: str = 'GANLeaks'
assessment_type: str = 'PerRecordKnnProbabilities'
class DatasetAttackGanLeaks(DatasetAttackPerRecord):
class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
"""
Privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of
members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset.
@ -60,7 +61,8 @@ class DatasetAttackGanLeaks(DatasetAttackPerRecord):
def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
synthetic_data: ArrayDataset, dataset_name: str,
config: Optional[DatasetAttackGanLeaksConfig] = DatasetAttackGanLeaksConfig()):
config: Optional[
DatasetAttackConfigPerRecordKnnProbabilities] = DatasetAttackConfigPerRecordKnnProbabilities()):
"""
:param original_data_members: A container for the training original samples and labels
:param original_data_non_members: A container for the holdout original samples and labels
@ -122,7 +124,8 @@ class DatasetAttackGanLeaks(DatasetAttackPerRecord):
pos_proba, neg_proba = \
dataset_attack_result.positive_probabilities, dataset_attack_result.negative_probabilities
fpr, tpr, threshold, auc, ap = self.calculate_metrics(pos_proba, neg_proba)
score = DatasetAttackScoreGanLeaks(self.dataset_name, roc_auc_score=auc, average_precision_score=ap)
score = DatasetAttackScorePerRecordKnnProbabilities(self.dataset_name, roc_auc_score=auc,
average_precision_score=ap)
if generate_plot:
self.plot_roc_curve(pos_proba, neg_proba)
return score