Add a risk score to the base class DatasetAttackScore, so that every implementation can set it based on its own specific values.

Signed-off-by: Maya Anderson <mayaa@il.ibm.com>
This commit is contained in:
Maya Anderson 2023-03-16 17:51:49 +02:00
parent 8a4df5a4a2
commit 80bec0c45b
7 changed files with 53 additions and 36 deletions

View file

@ -39,8 +39,8 @@ class KNNAttackStrategyUtils(AttackStrategyUtils):
:param query_samples: query samples, to which nearest neighbors are to be found
:param knn_learner: unsupervised learner for implementing neighbor searches, after it was fitted
:param distance_processor: function for processing the distance into another more relevant metric per sample.
Its input is an array representing distances (the distances returned by NearestNeighbors.kneighbors() ),
and the output should be another array with distance-based values that enable to compute the final score
Its input is an array representing distances (the distances returned by NearestNeighbors.kneighbors() ), and
the output should be another array with distance-based values from which the final risk score can be computed
:return:
distances of the query samples to their nearest neighbors, or a metric based on that distance and calculated
by the distance_processor function

View file

@ -45,7 +45,7 @@ class DatasetAssessmentManager:
:param dataset_name: A name to identify this dataset, optional
:return:
a list of dataset attack scores
a list of dataset attack risk scores
"""
config_gl = DatasetAttackConfigMembershipKnnProbabilities(use_batches=False,
generate_plot=self.config.generate_plots)

View file

@ -54,7 +54,7 @@ class DatasetAttack(abc.ABC):
"""
Assess the privacy of the dataset
:return:
score: DatasetAttackScore the privacy attack score
score: DatasetAttackScore the privacy attack risk score
"""
pass

View file

@ -42,17 +42,24 @@ class DatasetAttackConfigMembershipKnnProbabilities(Config):
@dataclass
class DatasetAttackScoreMembershipKnnProbabilities(DatasetAttackScore):
"""DatasetAttackMembershipKnnProbabilities privacy score.
Attributes
----------
roc_auc_score : the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack
performance.
average_precision_score: the proportion of predicted members that are correctly members
assessment_type : assessment type is 'MembershipKnnProbabilities', to be used in reports
"""DatasetAttackMembershipKnnProbabilities privacy risk score.
"""
roc_auc_score: float = -1.0
average_precision_score: float = -1.0
assessment_type: str = 'MembershipKnnProbabilities'
roc_auc_score: float
average_precision_score: float
assessment_type: str = 'MembershipKnnProbabilities' # to be used in reports
def __init__(self, dataset_name: str, roc_auc_score: float, average_precision_score: float,
result: DatasetAttackResultMembership) -> None:
"""
dataset_name: dataset name to be used in reports
roc_auc_score: the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack
performance.
average_precision_score: the proportion of predicted members that are actually members
result: the result of the membership inference attack
"""
super().__init__(dataset_name=dataset_name, risk_score=roc_auc_score, result=result)
self.roc_auc_score = roc_auc_score
self.average_precision_score = average_precision_score
class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership):

View file

@ -14,7 +14,8 @@ class DatasetAttackResult:
@dataclass
class DatasetAttackScore:
dataset_name: str
result: Optional[DatasetAttackResult] = None
risk_score: float
result: Optional[DatasetAttackResult]
@dataclass

View file

@ -39,15 +39,19 @@ class DatasetAttackConfigWholeDatasetKnnDistance(Config):
@dataclass
class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore):
"""DatasetAttackWholeDatasetKnnDistance privacy score.
Attributes
----------
share : the share of synthetic records closer to the training than the holdout dataset.
A value of 0.5 or close to it means good privacy.
assessment_type : assessment type is 'WholeDatasetKnnDistance', to be used in reports
"""DatasetAttackWholeDatasetKnnDistance privacy risk score.
"""
share: float = -1.0
assessment_type: str = 'WholeDatasetKnnDistance'
share: float
assessment_type: str = 'WholeDatasetKnnDistance' # to be used in reports
def __init__(self, dataset_name, share) -> None:
"""
dataset_name: dataset name to be used in reports
share : the share of synthetic records closer to the training dataset than to the holdout dataset.
A value of 0.5 or close to it means good privacy.
"""
super().__init__(dataset_name=dataset_name, risk_score=share, result=None)
self.share = share
class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):

View file

@ -14,6 +14,10 @@ from apt.utils.dataset_utils import get_iris_dataset_np, get_diabetes_dataset_np
get_nursery_dataset_pd
from apt.utils.datasets import ArrayDataset
MIN_SHARE = 0.5
MIN_ROC_AUC = 0.0
MIN_PRECISION = 0.0
NUM_SYNTH_SAMPLES = 40000
NUM_SYNTH_COMPONENTS = 4
@ -60,12 +64,9 @@ def test_risk_anonymization(name, data, dataset_type, k, mgr):
original_data_members = ArrayDataset(preprocessed_x_train, y_train)
original_data_non_members = ArrayDataset(preprocessed_x_test, y_test)
[score_g, score_h] = mgr.assess(original_data_members, original_data_non_members, anonymized_data,
f'anon_k{k}_{name}')
assert (score_g.roc_auc_score > 0.5)
assert (score_g.average_precision_score > 0.5)
assert (score_h.share > 0.5)
dataset_name = f'anon_k{k}_{name}'
assess_privacy_and_validate_result(mgr, original_data_members, original_data_non_members, anonymized_data,
dataset_name)
testdata = [('iris_np', iris_dataset_np, 'np', mgr),
@ -96,13 +97,8 @@ def test_risk_kde(name, data, dataset_type, mgr):
original_data_members = ArrayDataset(encoded, y_train)
original_data_non_members = ArrayDataset(encoded_test, y_test)
[score_g, score_h] = mgr.assess(original_data_members, original_data_non_members, synth_data,
'kde' + str(NUM_SYNTH_SAMPLES) + name)
assert (score_g.roc_auc_score > 0.5)
assert (score_g.average_precision_score > 0.5)
assert (score_h.share > 0.5)
dataset_name = 'kde' + str(NUM_SYNTH_SAMPLES) + name
assess_privacy_and_validate_result(mgr, original_data_members, original_data_non_members, synth_data, dataset_name)
def kde(n_samples, n_components, original_data):
@ -166,3 +162,12 @@ def preprocess_nursery_x_data(x_train, x_test):
encoded = preprocessor.fit_transform(x_train)
encoded_test = preprocessor.fit_transform(x_test)
return encoded, encoded_test
def assess_privacy_and_validate_result(dataset_assessment_manager, original_data_members, original_data_non_members,
                                       synth_data, dataset_name):
    """
    Run the dataset privacy assessment and check the returned scores against the module-level minimum thresholds.

    :param dataset_assessment_manager: object whose ``assess`` method is called to produce the scores
    :param original_data_members: forwarded to ``assess`` as its first argument
    :param original_data_non_members: forwarded to ``assess`` as its second argument
    :param synth_data: forwarded to ``assess`` as its third argument
    :param dataset_name: forwarded to ``assess`` as its fourth argument
    :return: None; raises AssertionError when a score does not exceed its threshold
    """
    scores = dataset_assessment_manager.assess(original_data_members, original_data_non_members, synth_data,
                                               dataset_name)
    # Exactly two scores are expected; the unpacking fails for any other count.
    membership_score, distance_score = scores

    assert membership_score.roc_auc_score > MIN_ROC_AUC
    assert membership_score.average_precision_score > MIN_PRECISION
    assert distance_score.share > MIN_SHARE