Unify the interface so that the main method assess_privacy always returns a score; the score may additionally contain the attack result, which can be analyzed further

Signed-off-by: Maya Anderson <mayaa@il.ibm.com>
This commit is contained in:
Maya Anderson 2023-03-08 12:25:58 +02:00
parent 69a9a8fa2b
commit 3ae64054f8
6 changed files with 42 additions and 54 deletions

View file

@ -3,7 +3,9 @@ Module providing privacy risk assessment for synthetic data.
The main interface, ``DatasetAttack``, with the assess_privacy() main method assumes the availability of the
training data, holdout data and synthetic data at the time of the privacy evaluation.
It is implemented by two types of abstract classes: ``DatasetAttackPerRecord`` and ``DatasetAttackWhole``, to be
implemented by concrete assessment methods.
It is to be implemented by concrete assessment methods, which can run the assessment on a per-record level,
or on the whole dataset.
The abstract class ``DatasetAttackPerRecord`` implements the ``DatasetAttack`` interface, but adds the result
of the attack, so that the final score contains both the result for further analysis and the calculated score.
"""
from apt.risk.data_assessment import dataset_attack

View file

@ -34,15 +34,15 @@ class DatasetAssessmentManager:
def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
synthetic_data: ArrayDataset, dataset_name: str = "dataset") -> (
DatasetAttackScorePerRecordKnnProbabilities, DatasetAttackScoreWholeDatasetKnnDistance):
config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False)
config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False,
generate_plot=self.config.generate_plots)
mgr = DatasetAttackPerRecordKnnProbabilities(original_data_members,
original_data_non_members,
synthetic_data,
dataset_name,
config_gl)
result = mgr.assess_privacy()
score_g = mgr.calculate_privacy_score(result, generate_plot=self.config.generate_plots)
score_g = mgr.assess_privacy()
self.attack_scores_per_record_knn_probabilities.append(score_g)
config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False)

View file

@ -2,7 +2,7 @@
This module defines the interface for privacy risk assessment of synthetic datasets.
"""
import abc
from typing import Optional, Union
from typing import Optional
import matplotlib.pyplot as plt
import numpy as np
@ -10,8 +10,7 @@ from sklearn import metrics
from sklearn.metrics import RocCurveDisplay
from apt.risk.data_assessment.attack_strategy_utils import AttackStrategyUtils
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord, \
DatasetAttackResult
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord
from apt.utils.datasets import ArrayDataset
@ -52,12 +51,11 @@ class DatasetAttack(abc.ABC):
self.config = config
@abc.abstractmethod
def assess_privacy(self) -> Union[DatasetAttackScore, DatasetAttackResult]:
def assess_privacy(self) -> DatasetAttackScore:
"""
Assess the privacy of the dataset
:return:
result: Union[DatasetAttackScore, DatasetAttackResult] can be either the final privacy attack score,
or an intermediate attack result, which can be translated into a privacy score if needed
score: DatasetAttackScore the privacy attack score
"""
pass
@ -67,22 +65,13 @@ class DatasetAttackPerRecord(DatasetAttack):
An abstract base class for performing privacy risk assessment for synthetic datasets on a per-record level.
"""
@abc.abstractmethod
def assess_privacy(self) -> DatasetAttackResultPerRecord:
"""
Assess the privacy of the dataset
:return:
result: DatasetAttackResultPerRecord
"""
pass
@abc.abstractmethod
def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultPerRecord,
generate_plot=False) -> DatasetAttackScore:
"""
Calculate dataset privacy score based on the result of the privacy assessment
Calculate dataset privacy score based on the result of the privacy attack
:return:
result: DatasetAttackScore
score: DatasetAttackScore
"""
pass
@ -120,18 +109,3 @@ class DatasetAttackPerRecord(DatasetAttack):
auc = metrics.roc_auc_score(labels, results)
ap = metrics.average_precision_score(labels, results)
return fpr, tpr, threshold, auc, ap
class DatasetAttackWhole(DatasetAttack):
"""
An abstract base class for performing privacy risk assessment for synthetic datasets on a whole-dataset level.
"""
@abc.abstractmethod
def assess_privacy(self) -> DatasetAttackScore:
"""
Assess the privacy of the dataset
:return:
result: DatasetAttackScore
"""
pass

View file

@ -1,19 +1,24 @@
from dataclasses import dataclass
from dataclasses import dataclass, field
import numpy as np
@dataclass
class DatasetAttackScore:
dataset_name: str
@dataclass
class DatasetAttackResult:
dataset_name: str
@dataclass
class DatasetAttackResultPerRecord(DatasetAttackResult):
positive_probabilities: np.ndarray
negative_probabilities: np.ndarray
class DatasetAttackScoreWithResult(DatasetAttackScore):
result: DatasetAttackResult = field(repr=False)
@dataclass
class DatasetAttackScore:
dataset_name: str
class DatasetAttackResultPerRecord(DatasetAttackResult):
positive_probabilities: np.ndarray
negative_probabilities: np.ndarray

View file

@ -12,7 +12,7 @@ import numpy as np
from sklearn.neighbors import NearestNeighbors
from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils
from apt.risk.data_assessment.dataset_attack import DatasetAttackWhole, Config
from apt.risk.data_assessment.dataset_attack import Config, DatasetAttack
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore
from apt.utils.datasets import ArrayDataset
@ -51,7 +51,7 @@ class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore):
assessment_type: str = 'WholeDatasetKnnDistance'
class DatasetAttackWholeDatasetKnnDistance(DatasetAttackWhole):
class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
"""
Privacy risk assessment for synthetic datasets based on distances of synthetic data records from
members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic
@ -89,7 +89,7 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttackWhole):
Calculate the share of synthetic records closer to the training than the holdout dataset, based on the
DCR computed by 'calculate_distances()'.
:return:
:result of the attack, based on the NN distances from the query samples to the synthetic data samples
:score of the attack, based on the NN distances from the query samples to the synthetic data samples
"""
member_distances, non_member_distances = self.calculate_distances()
# distance of the synth. records to members and to non-members

View file

@ -12,7 +12,8 @@ from sklearn.neighbors import NearestNeighbors
from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils
from apt.risk.data_assessment.dataset_attack import DatasetAttackPerRecord, Config
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultPerRecord, \
DatasetAttackScoreWithResult
from apt.utils.datasets import ArrayDataset
@ -29,16 +30,18 @@ class DatasetAttackConfigPerRecordKnnProbabilities(Config):
See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation.
distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in
sklearn.neighbors.NearestNeighbors documentation.
generate_plot: Whether to generate an ROC curve plot and persist it in a file
"""
k: int = 5
use_batches: bool = False
batch_size: int = 10
compute_distance: Callable = None
distance_params: dict = None
generate_plot: bool = False
@dataclass
class DatasetAttackScorePerRecordKnnProbabilities(DatasetAttackScore):
class DatasetAttackScorePerRecordKnnProbabilities(DatasetAttackScoreWithResult):
"""DatasetAttackPerRecordKnnProbabilities privacy score.
Attributes
----------
@ -81,7 +84,7 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
else:
self.knn_learner = NearestNeighbors(n_neighbors=config.k, algorithm='auto')
def assess_privacy(self) -> DatasetAttackResultPerRecord:
def assess_privacy(self) -> DatasetAttackScorePerRecordKnnProbabilities:
"""
Membership Inference Attack which calculates probabilities of positive and negative samples to be generated by
the synthetic data generator.
@ -94,8 +97,9 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
query samples to the synthetic data samples.
:return:
:result of the attack with the probabilities of positive and negative samples to be generated by the
synthetic data generator based on the NN distances from the query samples to the synthetic data samples
:score Privacy score of the attack together with the attack result with the probabilities of positive and
negative samples to be generated by the synthetic data generator based on the NN distances from the
query samples to the synthetic data samples
"""
# nearest neighbor search
self.attack_strategy_utils.fit(self.knn_learner, self.synthetic_data)
@ -110,7 +114,9 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
result = DatasetAttackResultPerRecord(self.dataset_name, positive_probabilities=pos_proba,
negative_probabilities=neg_proba)
return result
score = self.calculate_privacy_score(result, self.config.generate_plot)
return score
def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultPerRecord,
generate_plot=False) -> DatasetAttackScore:
@ -126,8 +132,9 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
pos_proba, neg_proba = \
dataset_attack_result.positive_probabilities, dataset_attack_result.negative_probabilities
fpr, tpr, threshold, auc, ap = self.calculate_metrics(pos_proba, neg_proba)
score = DatasetAttackScorePerRecordKnnProbabilities(self.dataset_name, roc_auc_score=auc,
average_precision_score=ap)
score = DatasetAttackScorePerRecordKnnProbabilities(self.dataset_name,
result=dataset_attack_result,
roc_auc_score=auc, average_precision_score=ap)
if generate_plot:
self.plot_roc_curve(pos_proba, neg_proba)
return score