Rename DatasetAttackPerRecordKnnProbabilities => DatasetAttackMembershipKnnProbabilities

Signed-off-by: Maya Anderson <mayaa@il.ibm.com>
This commit is contained in:
Maya Anderson 2023-03-09 23:17:37 +02:00
parent a122976807
commit 4988fea08c
6 changed files with 70 additions and 71 deletions

View file

@ -5,7 +5,8 @@ The main interface, ``DatasetAttack``, with the assess_privacy() main method ass
training data, holdout data and synthetic data at the time of the privacy evaluation.
It is to be implemented by concrete assessment methods, which can run the assessment on a per-record level,
or on the whole dataset.
The abstract class ``DatasetAttackPerRecord`` implements the ``DatasetAttack`` interface, but adds the result
of the attack, so that the final score contains both the result for further analysis and the calculated score.
The abstract class ``DatasetAttackMembership`` implements the ``DatasetAttack`` interface, but adds the result
of the membership inference attack, so that the final score contains both the membership inference attack result
for further analysis and the calculated score.
"""
from apt.risk.data_assessment import dataset_attack

View file

@ -7,11 +7,9 @@ import pandas as pd
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DEFAULT_DATASET_NAME
from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import \
DatasetAttackConfigWholeDatasetKnnDistance, DatasetAttackWholeDatasetKnnDistance, \
DatasetAttackScoreWholeDatasetKnnDistance
from apt.risk.data_assessment.dataset_attack_per_record_knn_probabilities import \
DatasetAttackConfigPerRecordKnnProbabilities, DatasetAttackPerRecordKnnProbabilities, \
DatasetAttackScorePerRecordKnnProbabilities
DatasetAttackConfigWholeDatasetKnnDistance, DatasetAttackWholeDatasetKnnDistance
from apt.risk.data_assessment.dataset_attack_membership_knn_probabilities import \
DatasetAttackConfigMembershipKnnProbabilities, DatasetAttackMembershipKnnProbabilities
from apt.utils.datasets import ArrayDataset
@ -49,13 +47,13 @@ class DatasetAssessmentManager:
:return:
a list of dataset attack scores
"""
config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False,
generate_plot=self.config.generate_plots)
mgr = DatasetAttackPerRecordKnnProbabilities(original_data_members,
original_data_non_members,
synthetic_data,
config_gl,
dataset_name)
config_gl = DatasetAttackConfigMembershipKnnProbabilities(use_batches=False,
generate_plot=self.config.generate_plots)
mgr = DatasetAttackMembershipKnnProbabilities(original_data_members,
original_data_non_members,
synthetic_data,
config_gl,
dataset_name)
score_g = mgr.assess_privacy()
self.attack_scores_per_record_knn_probabilities.append(score_g)

View file

@ -10,7 +10,7 @@ from sklearn import metrics
from sklearn.metrics import RocCurveDisplay
from apt.risk.data_assessment.attack_strategy_utils import AttackStrategyUtils
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultMembership
from apt.utils.datasets import ArrayDataset
@ -60,13 +60,13 @@ class DatasetAttack(abc.ABC):
pass
class DatasetAttackPerRecord(DatasetAttack):
class DatasetAttackMembership(DatasetAttack):
"""
An abstract base class for performing privacy risk assessment for synthetic datasets on a per-record level.
"""
@abc.abstractmethod
def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultPerRecord,
def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultMembership,
generate_plot=False) -> DatasetAttackScore:
"""
Calculate dataset privacy score based on the result of the privacy attack
@ -75,15 +75,15 @@ class DatasetAttackPerRecord(DatasetAttack):
"""
pass
def plot_roc_curve(self, pos_probabilities, neg_probabilities, name_prefix=""):
def plot_roc_curve(self, member_probabilities, non_member_probabilities, name_prefix=""):
"""
Plot ROC curve
:param pos_probabilities: probability estimates of the positive samples, the training data
:param neg_probabilities: probability estimates of the negative samples, the hold-out data
:param member_probabilities: probability estimates of the member samples, the training data
:param non_member_probabilities: probability estimates of the non-member samples, the hold-out data
:param name_prefix: name prefix for the ROC curve plot
"""
labels = np.concatenate((np.zeros((len(neg_probabilities),)), np.ones((len(pos_probabilities),))))
results = np.concatenate((neg_probabilities, pos_probabilities))
labels = np.concatenate((np.zeros((len(non_member_probabilities),)), np.ones((len(member_probabilities),))))
results = np.concatenate((non_member_probabilities, member_probabilities))
svc_disp = RocCurveDisplay.from_predictions(labels, results)
svc_disp.plot()
plt.plot([0, 1], [0, 1], color="navy", linewidth=2, linestyle="--", label='No skills')
@ -91,11 +91,11 @@ class DatasetAttackPerRecord(DatasetAttack):
plt.savefig(f'{name_prefix}{self.dataset_name}_roc_curve.png')
@staticmethod
def calculate_metrics(pos_probabilities, neg_probabilities):
def calculate_metrics(member_probabilities, non_member_probabilities):
"""
Calculate attack performance metrics
:param pos_probabilities: probability estimates of the positive samples, the training data
:param neg_probabilities: probability estimates of the negative samples, the hold-out data
:param member_probabilities: probability estimates of the member samples, the training data
:param non_member_probabilities: probability estimates of the non-member samples, the hold-out data
:return:
fpr: False Positive rate
tpr: True Positive rate
@ -103,8 +103,8 @@ class DatasetAttackPerRecord(DatasetAttack):
auc: area under the Receiver Operating Characteristic Curve
ap: average precision score
"""
labels = np.concatenate((np.zeros((len(neg_probabilities),)), np.ones((len(pos_probabilities)))))
results = np.concatenate((neg_probabilities, pos_probabilities))
labels = np.concatenate((np.zeros((len(non_member_probabilities),)), np.ones((len(member_probabilities)))))
results = np.concatenate((non_member_probabilities, member_probabilities))
fpr, tpr, threshold = metrics.roc_curve(labels, results, pos_label=1)
auc = metrics.roc_auc_score(labels, results)
ap = metrics.average_precision_score(labels, results)

View file

@ -11,15 +11,15 @@ import numpy as np
from sklearn.neighbors import NearestNeighbors
from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils
from apt.risk.data_assessment.dataset_attack import DatasetAttackPerRecord, Config
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord, \
from apt.risk.data_assessment.dataset_attack import DatasetAttackMembership, Config
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultMembership, \
DatasetAttackScoreWithResult, DEFAULT_DATASET_NAME
from apt.utils.datasets import ArrayDataset
@dataclass
class DatasetAttackConfigPerRecordKnnProbabilities(Config):
"""Configuration for DatasetAttackPerRecordKnnProbabilities.
class DatasetAttackConfigMembershipKnnProbabilities(Config):
"""Configuration for DatasetAttackMembershipKnnProbabilities.
Attributes:
k: Number of nearest neighbors to search
@ -41,21 +41,21 @@ class DatasetAttackConfigPerRecordKnnProbabilities(Config):
@dataclass
class DatasetAttackScorePerRecordKnnProbabilities(DatasetAttackScoreWithResult):
"""DatasetAttackPerRecordKnnProbabilities privacy score.
class DatasetAttackScoreMembershipKnnProbabilities(DatasetAttackScoreWithResult):
"""DatasetAttackMembershipKnnProbabilities privacy score.
Attributes
----------
roc_auc_score : the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack
performance.
average_precision_score: the proportion of Predicted Positive cases that are correctly Real Positives (members)
assessment_type : assessment type is 'PerRecordKnnProbabilities', to be used in reports
average_precision_score: the proportion of predicted members that are actual members
assessment_type : assessment type is 'MembershipKnnProbabilities', to be used in reports
"""
roc_auc_score: float
average_precision_score: float
assessment_type: str = 'PerRecordKnnProbabilities'
assessment_type: str = 'MembershipKnnProbabilities'
class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership):
"""
Privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of
members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset.
@ -66,7 +66,7 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
synthetic_data: ArrayDataset,
config: DatasetAttackConfigPerRecordKnnProbabilities = DatasetAttackConfigPerRecordKnnProbabilities(),
config: DatasetAttackConfigMembershipKnnProbabilities = DatasetAttackConfigMembershipKnnProbabilities(),
dataset_name: str = DEFAULT_DATASET_NAME):
"""
:param original_data_members: A container for the training original samples and labels
@ -84,9 +84,9 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
else:
self.knn_learner = NearestNeighbors(n_neighbors=config.k, algorithm='auto')
def assess_privacy(self) -> DatasetAttackScorePerRecordKnnProbabilities:
def assess_privacy(self) -> DatasetAttackScoreMembershipKnnProbabilities:
"""
Membership Inference Attack which calculates probabilities of positive and negative samples to be generated by
Membership Inference Attack which calculates probabilities of member and non-member samples to be generated by
the synthetic data generator.
The assumption is that since the generative model is trained to approximate the training data distribution
then the probability of a sample to be a member of the training data should be proportional to the probability
@ -97,46 +97,46 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
query samples to the synthetic data samples.
:return:
Privacy score of the attack together with the attack result with the probabilities of positive and
negative samples to be generated by the synthetic data generator based on the NN distances from the
Privacy score of the attack together with the attack result with the probabilities of member and
non-member samples to be generated by the synthetic data generator based on the NN distances from the
query samples to the synthetic data samples
"""
# nearest neighbor search
self.attack_strategy_utils.fit(self.knn_learner, self.synthetic_data)
# positive query
pos_proba = self.attack_strategy_utils.find_knn(self.knn_learner, self.original_data_members,
self.probability_per_sample)
# members query
member_proba = self.attack_strategy_utils.find_knn(self.knn_learner, self.original_data_members,
self.probability_per_sample)
# negative query
neg_proba = self.attack_strategy_utils.find_knn(self.knn_learner, self.original_data_non_members,
self.probability_per_sample)
# non-members query
non_member_proba = self.attack_strategy_utils.find_knn(self.knn_learner, self.original_data_non_members,
self.probability_per_sample)
result = DatasetAttackResultPerRecord(positive_probabilities=pos_proba,
negative_probabilities=neg_proba)
result = DatasetAttackResultMembership(member_probabilities=member_proba,
non_member_probabilities=non_member_proba)
score = self.calculate_privacy_score(result, self.config.generate_plot)
return score
def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultPerRecord,
def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultMembership,
generate_plot=False) -> DatasetAttackScore:
"""
Evaluate privacy score from the probabilities of positive and negative samples to be generated by the synthetic
Evaluate privacy score from the probabilities of member and non-member samples to be generated by the synthetic
data generator. The probabilities are computed by the 'assess_privacy()' method.
:param dataset_attack_result attack result containing probabilities of positive and negative samples to be
:param dataset_attack_result: attack result containing probabilities of member and non-member samples to be
generated by the synthetic data generator
:param generate_plot: generate AUC ROC curve plot and persist it
:return:
score of the attack, based on distance-based probabilities - mainly the ROC AUC score
"""
pos_proba, neg_proba = \
dataset_attack_result.positive_probabilities, dataset_attack_result.negative_probabilities
fpr, tpr, threshold, auc, ap = self.calculate_metrics(pos_proba, neg_proba)
score = DatasetAttackScorePerRecordKnnProbabilities(self.dataset_name,
result=dataset_attack_result,
roc_auc_score=auc, average_precision_score=ap)
member_proba, non_member_proba = \
dataset_attack_result.member_probabilities, dataset_attack_result.non_member_probabilities
fpr, tpr, threshold, auc, ap = self.calculate_metrics(member_proba, non_member_proba)
score = DatasetAttackScoreMembershipKnnProbabilities(self.dataset_name,
result=dataset_attack_result,
roc_auc_score=auc, average_precision_score=ap)
if generate_plot:
self.plot_roc_curve(pos_proba, neg_proba)
self.plot_roc_curve(member_proba, non_member_proba)
return score
@staticmethod

View file

@ -2,9 +2,9 @@ from dataclasses import dataclass, field
import numpy as np
DEFAULT_DATASET_NAME = "dataset"
@dataclass
class DatasetAttackScore:
dataset_name: str
@ -21,6 +21,6 @@ class DatasetAttackScoreWithResult(DatasetAttackScore):
@dataclass
class DatasetAttackResultPerRecord(DatasetAttackResult):
positive_probabilities: np.ndarray
negative_probabilities: np.ndarray
class DatasetAttackResultMembership(DatasetAttackResult):
member_probabilities: np.ndarray
non_member_probabilities: np.ndarray

View file

@ -104,20 +104,20 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
def calculate_distances(self):
"""
Calculate positive and negative query probabilities, based on their distance to their KNN among
Calculate member and non-member query probabilities, based on their distance to their KNN among
synthetic samples. This distance is called distance to the closest record (DCR), as defined by
N. Park et al. in "Data Synthesis based on Generative Adversarial Networks."
:return:
pos_distances - distances of each synthetic data member from its nearest training sample
neg_distances - distances of each synthetic data member from its nearest validation sample
member_distances - distances of each synthetic data member from its nearest training sample
non_member_distances - distances of each synthetic data member from its nearest validation sample
"""
# nearest neighbor search
self.attack_strategy_utils.fit(self.knn_learner_members, self.original_data_members)
self.attack_strategy_utils.fit(self.knn_learner_non_members, self.original_data_non_members)
# distances of the synthetic data from the positive and negative samples (members and non-members)
pos_distances = self.attack_strategy_utils.find_knn(self.knn_learner_members, self.synthetic_data)
neg_distances = self.attack_strategy_utils.find_knn(self.knn_learner_non_members, self.synthetic_data)
# distances of the synthetic data from the member and non-member samples
member_distances = self.attack_strategy_utils.find_knn(self.knn_learner_members, self.synthetic_data)
non_member_distances = self.attack_strategy_utils.find_knn(self.knn_learner_non_members, self.synthetic_data)
return pos_distances, neg_distances
return member_distances, non_member_distances