mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-04-26 21:36:22 +02:00
Increase version to 0.2.0 (#74)
* Remove tensorflow dependency if not using keras model * Remove xgboost dependency if not using xgboost model * Documentation updates Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
parent
782edabd58
commit
8a9ef80146
25 changed files with 306 additions and 152 deletions
|
|
@ -9,21 +9,20 @@ from apt.utils.datasets import ArrayDataset
|
|||
|
||||
class AttackStrategyUtils(abc.ABC):
|
||||
"""
|
||||
Abstract base class for common utilities of various privacy attack strategies.
|
||||
Abstract base class for common utilities of various privacy attack strategies.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class KNNAttackStrategyUtils(AttackStrategyUtils):
|
||||
"""
|
||||
Common utilities for attack strategy based on KNN distances.
|
||||
Common utilities for attack strategy based on KNN distances.
|
||||
|
||||
:param use_batches: Whether to use batches with a progress meter when finding KNNs for the query set.
|
||||
:param batch_size: Query batch size; must be > 0 when use_batches=True.
|
||||
"""
|
||||
|
||||
def __init__(self, use_batches: bool = False, batch_size: int = 10) -> None:
|
||||
"""
|
||||
:param use_batches: Use batches with a progress meter or not when finding KNNs for query set
|
||||
:param batch_size: if use_batches=True, the size of batch_size should be > 0
|
||||
"""
|
||||
self.use_batches = use_batches
|
||||
self.batch_size = batch_size
|
||||
if use_batches:
|
||||
|
|
@ -31,11 +30,18 @@ class KNNAttackStrategyUtils(AttackStrategyUtils):
|
|||
raise ValueError(f"When using batching batch_size should be > 0, and not {batch_size}")
|
||||
|
||||
def fit(self, knn_learner: NearestNeighbors, dataset: ArrayDataset):
|
||||
"""
|
||||
Fit the KNN learner.
|
||||
|
||||
:param knn_learner: The KNN model to fit.
|
||||
:param dataset: The training set to fit the model on.
|
||||
"""
|
||||
knn_learner.fit(dataset.get_samples())
|
||||
|
||||
def find_knn(self, knn_learner: NearestNeighbors, query_samples: ArrayDataset, distance_processor=None):
|
||||
"""
|
||||
Nearest neighbor search function.
|
||||
|
||||
:param query_samples: query samples, to which nearest neighbors are to be found
|
||||
:param knn_learner: unsupervised learner for implementing neighbor searches, after it was fitted
|
||||
:param distance_processor: function for processing the distance into another more relevant metric per sample.
|
||||
|
|
|
|||
|
|
@ -15,6 +15,12 @@ from apt.utils.datasets import ArrayDataset
|
|||
|
||||
@dataclass
|
||||
class DatasetAssessmentManagerConfig:
|
||||
"""
|
||||
Configuration for DatasetAssessmentManager.
|
||||
|
||||
:param persist_reports: Whether to save assessment results to filesystem.
|
||||
:param generate_plots: Whether to generate and visualize plots as part of assessment.
|
||||
"""
|
||||
persist_reports: bool = False
|
||||
generate_plots: bool = False
|
||||
|
||||
|
|
@ -22,14 +28,13 @@ class DatasetAssessmentManagerConfig:
|
|||
class DatasetAssessmentManager:
|
||||
"""
|
||||
The main class for running dataset assessment attacks.
|
||||
|
||||
:param config: Configuration parameters to guide the dataset assessment process
|
||||
"""
|
||||
attack_scores_per_record_knn_probabilities: list[DatasetAttackScore] = []
|
||||
attack_scores_whole_dataset_knn_distance: list[DatasetAttackScore] = []
|
||||
|
||||
def __init__(self, config: Optional[DatasetAssessmentManagerConfig] = DatasetAssessmentManagerConfig) -> None:
|
||||
"""
|
||||
:param config: Configuration parameters to guide the dataset assessment process
|
||||
"""
|
||||
self.config = config
|
||||
|
||||
def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
|
||||
|
|
@ -67,14 +72,17 @@ class DatasetAssessmentManager:
|
|||
return [score_gl, score_h]
|
||||
|
||||
def dump_all_scores_to_files(self):
|
||||
"""
|
||||
Save assessment results to filesystem.
|
||||
"""
|
||||
if self.config.persist_reports:
|
||||
results_log_file = "_results.log.csv"
|
||||
self.dump_scores_to_file(self.attack_scores_per_record_knn_probabilities,
|
||||
self._dump_scores_to_file(self.attack_scores_per_record_knn_probabilities,
|
||||
"per_record_knn_probabilities" + results_log_file, True)
|
||||
self.dump_scores_to_file(self.attack_scores_whole_dataset_knn_distance,
|
||||
self._dump_scores_to_file(self.attack_scores_whole_dataset_knn_distance,
|
||||
"whole_dataset_knn_distance" + results_log_file, True)
|
||||
|
||||
@staticmethod
|
||||
def dump_scores_to_file(attack_scores: list[DatasetAttackScore], filename: str, header: bool):
|
||||
def _dump_scores_to_file(attack_scores: list[DatasetAttackScore], filename: str, header: bool):
|
||||
run_results_df = pd.DataFrame(attack_scores).drop('result', axis=1, errors='ignore') # don't serialize result
|
||||
run_results_df.to_csv(filename, header=header, encoding='utf-8', index=False, mode='w') # Overwrite
|
||||
|
|
|
|||
|
|
@ -16,32 +16,30 @@ from apt.utils.datasets import ArrayDataset
|
|||
|
||||
class Config(abc.ABC):
|
||||
"""
|
||||
The base class for dataset attack configurations
|
||||
The base class for dataset attack configurations
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class DatasetAttack(abc.ABC):
|
||||
"""
|
||||
The interface for performing privacy attack for risk assessment of synthetic datasets to be used in AI model
|
||||
training. The original data members (training data) and non-members (the holdout data) should be available.
|
||||
For reliability, all the datasets should be preprocessed and normalized.
|
||||
The interface for performing privacy attack for risk assessment of synthetic datasets to be used in AI model
|
||||
training. The original data members (training data) and non-members (the holdout data) should be available.
|
||||
For reliability, all the datasets should be preprocessed and normalized.
|
||||
|
||||
:param original_data_members: A container for the training original samples and labels,
|
||||
only samples are used in the assessment
|
||||
:param original_data_non_members: A container for the holdout original samples and labels,
|
||||
only samples are used in the assessment
|
||||
:param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment
|
||||
:param config: Configuration parameters to guide the assessment process
|
||||
:param dataset_name: A name to identify the dataset under attack, optional
|
||||
:param attack_strategy_utils: Utils for use with the attack strategy, optional
|
||||
"""
|
||||
|
||||
def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
|
||||
synthetic_data: ArrayDataset, config: Config, dataset_name: str,
|
||||
attack_strategy_utils: Optional[AttackStrategyUtils] = None) -> None:
|
||||
"""
|
||||
:param original_data_members: A container for the training original samples and labels,
|
||||
only samples are used in the assessment
|
||||
:param original_data_non_members: A container for the holdout original samples and labels,
|
||||
only samples are used in the assessment
|
||||
:param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment
|
||||
:param config: Configuration parameters to guide the assessment process
|
||||
:param dataset_name: A name to identify the dataset under attack, optional
|
||||
:param attack_strategy_utils: Utils for use with the attack strategy, optional
|
||||
"""
|
||||
|
||||
self.original_data_members = original_data_members
|
||||
self.original_data_non_members = original_data_non_members
|
||||
self.synthetic_data = synthetic_data
|
||||
|
|
@ -52,7 +50,8 @@ class DatasetAttack(abc.ABC):
|
|||
@abc.abstractmethod
|
||||
def assess_privacy(self) -> DatasetAttackScore:
|
||||
"""
|
||||
Assess the privacy of the dataset
|
||||
Assess the privacy of the dataset.
|
||||
|
||||
:return:
|
||||
score: DatasetAttackScore the privacy attack risk score
|
||||
"""
|
||||
|
|
@ -61,14 +60,15 @@ class DatasetAttack(abc.ABC):
|
|||
|
||||
class DatasetAttackMembership(DatasetAttack):
|
||||
"""
|
||||
An abstract base class for performing privacy risk assessment for synthetic datasets on a per-record level.
|
||||
An abstract base class for performing privacy risk assessment for synthetic datasets on a per-record level.
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultMembership,
|
||||
generate_plot: bool = False) -> DatasetAttackScore:
|
||||
"""
|
||||
Calculate dataset privacy score based on the result of the privacy attack
|
||||
Calculate dataset privacy score based on the result of the privacy attack.
|
||||
|
||||
:return:
|
||||
score: DatasetAttackScore
|
||||
"""
|
||||
|
|
@ -78,15 +78,16 @@ class DatasetAttackMembership(DatasetAttack):
|
|||
def plot_roc_curve(dataset_name: str, member_probabilities: np.ndarray, non_member_probabilities: np.ndarray,
|
||||
filename_prefix: str = ""):
|
||||
"""
|
||||
Plot ROC curve
|
||||
:param dataset_name: dataset name, will become part of the plot filename
|
||||
:param member_probabilities: probability estimates of the member samples, the training data
|
||||
:param non_member_probabilities: probability estimates of the non-member samples, the hold-out data
|
||||
:param filename_prefix: name prefix for the ROC curve plot
|
||||
Plot ROC curve.
|
||||
|
||||
:param dataset_name: dataset name, will become part of the plot filename.
|
||||
:param member_probabilities: probability estimates of the member samples, the training data.
|
||||
:param non_member_probabilities: probability estimates of the non-member samples, the hold-out data.
|
||||
:param filename_prefix: name prefix for the ROC curve plot.
|
||||
"""
|
||||
labels = np.concatenate((np.zeros((len(non_member_probabilities),)), np.ones((len(member_probabilities),))))
|
||||
results = np.concatenate((non_member_probabilities, member_probabilities))
|
||||
svc_disp = RocCurveDisplay.from_predictions(labels, results)
|
||||
RocCurveDisplay.from_predictions(labels, results)
|
||||
plt.plot([0, 1], [0, 1], color="navy", linewidth=2, linestyle="--", label='No skills')
|
||||
plt.title('ROC curve')
|
||||
plt.savefig(f'{filename_prefix}{dataset_name}_roc_curve.png')
|
||||
|
|
@ -94,9 +95,10 @@ class DatasetAttackMembership(DatasetAttack):
|
|||
@staticmethod
|
||||
def calculate_metrics(member_probabilities: np.ndarray, non_member_probabilities: np.ndarray):
|
||||
"""
|
||||
Calculate attack performance metrics
|
||||
:param member_probabilities: probability estimates of the member samples, the training data
|
||||
:param non_member_probabilities: probability estimates of the non-member samples, the hold-out data
|
||||
Calculate attack performance metrics.
|
||||
|
||||
:param member_probabilities: probability estimates of the member samples, the training data.
|
||||
:param non_member_probabilities: probability estimates of the non-member samples, the hold-out data.
|
||||
:return:
|
||||
fpr: False Positive rate
|
||||
tpr: True Positive rate
|
||||
|
|
|
|||
|
|
@ -19,18 +19,18 @@ from apt.utils.datasets import ArrayDataset
|
|||
|
||||
@dataclass
|
||||
class DatasetAttackConfigMembershipKnnProbabilities(Config):
|
||||
"""Configuration for DatasetAttackMembershipKnnProbabilities.
|
||||
"""
|
||||
Configuration for DatasetAttackMembershipKnnProbabilities.
|
||||
|
||||
Attributes:
|
||||
k: Number of nearest neighbors to search
|
||||
use_batches: Divide query samples into batches or not.
|
||||
batch_size: Query sample batch size.
|
||||
compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must return
|
||||
one value indicating the distance between those vectors.
|
||||
See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation.
|
||||
distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in
|
||||
sklearn.neighbors.NearestNeighbors documentation.
|
||||
generate_plot: Whether to generate an AUC ROC curve and persist it in a file
|
||||
:param k: Number of nearest neighbors to search.
|
||||
:param use_batches: Divide query samples into batches or not.
|
||||
:param batch_size: Query sample batch size.
|
||||
:param compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must
|
||||
return one value indicating the distance between those vectors.
|
||||
See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation.
|
||||
:param distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in
|
||||
sklearn.neighbors.NearestNeighbors documentation.
|
||||
:param generate_plot: Whether to generate an AUC ROC curve and persist it in a file.
|
||||
"""
|
||||
k: int = 5
|
||||
use_batches: bool = False
|
||||
|
|
@ -42,7 +42,14 @@ class DatasetAttackConfigMembershipKnnProbabilities(Config):
|
|||
|
||||
@dataclass
|
||||
class DatasetAttackScoreMembershipKnnProbabilities(DatasetAttackScore):
|
||||
"""DatasetAttackMembershipKnnProbabilities privacy risk score.
|
||||
"""
|
||||
DatasetAttackMembershipKnnProbabilities privacy risk score.
|
||||
|
||||
:param dataset_name: dataset name to be used in reports
|
||||
:param roc_auc_score: the area under the receiver operating characteristic curve (AUC ROC) to evaluate the
|
||||
attack performance.
|
||||
:param average_precision_score: the proportion of predicted members that are correctly members.
|
||||
:param result: the result of the membership inference attack.
|
||||
"""
|
||||
roc_auc_score: float
|
||||
average_precision_score: float
|
||||
|
|
@ -50,13 +57,6 @@ class DatasetAttackScoreMembershipKnnProbabilities(DatasetAttackScore):
|
|||
|
||||
def __init__(self, dataset_name: str, roc_auc_score: float, average_precision_score: float,
|
||||
result: DatasetAttackResultMembership) -> None:
|
||||
"""
|
||||
dataset_name: dataset name to be used in reports
|
||||
roc_auc_score: the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack
|
||||
performance.
|
||||
average_precision_score: the proportion of predicted members that are correctly members
|
||||
result: the result of the membership inference attack
|
||||
"""
|
||||
super().__init__(dataset_name=dataset_name, risk_score=roc_auc_score, result=result)
|
||||
self.roc_auc_score = roc_auc_score
|
||||
self.average_precision_score = average_precision_score
|
||||
|
|
@ -64,24 +64,23 @@ class DatasetAttackScoreMembershipKnnProbabilities(DatasetAttackScore):
|
|||
|
||||
class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership):
|
||||
"""
|
||||
Privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of
|
||||
members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset.
|
||||
By default, the Euclidean distance is used (L2 norm), but another ``compute_distance()`` method can be provided
|
||||
in configuration instead.
|
||||
The area under the receiver operating characteristic curve (AUC ROC) gives the privacy risk measure.
|
||||
Privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of
|
||||
members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset.
|
||||
By default, the Euclidean distance is used (L2 norm), but another ``compute_distance()`` method can be provided
|
||||
in configuration instead.
|
||||
The area under the receiver operating characteristic curve (AUC ROC) gives the privacy risk measure.
|
||||
|
||||
:param original_data_members: A container for the training original samples and labels
|
||||
:param original_data_non_members: A container for the holdout original samples and labels
|
||||
:param synthetic_data: A container for the synthetic samples and labels
|
||||
:param config: Configuration parameters to guide the attack, optional
|
||||
:param dataset_name: A name to identify this dataset, optional
|
||||
"""
|
||||
|
||||
def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
|
||||
synthetic_data: ArrayDataset,
|
||||
config: DatasetAttackConfigMembershipKnnProbabilities = DatasetAttackConfigMembershipKnnProbabilities(),
|
||||
dataset_name: str = DEFAULT_DATASET_NAME):
|
||||
"""
|
||||
:param original_data_members: A container for the training original samples and labels
|
||||
:param original_data_non_members: A container for the holdout original samples and labels
|
||||
:param synthetic_data: A container for the synthetic samples and labels
|
||||
:param config: Configuration parameters to guide the attack, optional
|
||||
:param dataset_name: A name to identify this dataset, optional
|
||||
"""
|
||||
attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size)
|
||||
super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name,
|
||||
attack_strategy_utils)
|
||||
|
|
@ -103,10 +102,9 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership):
|
|||
by the Parzen window density estimation in ``probability_per_sample()``, computed from the NN distances from the
|
||||
query samples to the synthetic data samples.
|
||||
|
||||
:return:
|
||||
Privacy score of the attack together with the attack result with the probabilities of member and
|
||||
non-member samples to be generated by the synthetic data generator based on the NN distances from the
|
||||
query samples to the synthetic data samples
|
||||
:return: Privacy score of the attack together with the attack result with the probabilities of member and
|
||||
non-member samples to be generated by the synthetic data generator based on the NN distances from the
|
||||
query samples to the synthetic data samples
|
||||
"""
|
||||
# nearest neighbor search
|
||||
self.attack_strategy_utils.fit(self.knn_learner, self.synthetic_data)
|
||||
|
|
@ -130,11 +128,11 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership):
|
|||
"""
|
||||
Evaluate privacy score from the probabilities of member and non-member samples to be generated by the synthetic
|
||||
data generator. The probabilities are computed by the ``assess_privacy()`` method.
|
||||
:param dataset_attack_result attack result containing probabilities of member and non-member samples to be
|
||||
generated by the synthetic data generator
|
||||
:param generate_plot generate AUC ROC curve plot and persist it
|
||||
:return:
|
||||
score of the attack, based on distance-based probabilities - mainly the ROC AUC score
|
||||
|
||||
:param dataset_attack_result: attack result containing probabilities of member and non-member samples to be
|
||||
generated by the synthetic data generator.
|
||||
:param generate_plot: generate AUC ROC curve plot and persist it.
|
||||
:return: score of the attack, based on distance-based probabilities - mainly the ROC AUC score.
|
||||
"""
|
||||
member_proba, non_member_proba = \
|
||||
dataset_attack_result.member_probabilities, dataset_attack_result.non_member_probabilities
|
||||
|
|
@ -151,10 +149,10 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership):
|
|||
"""
|
||||
For every sample represented by its distance from the query sample to its KNN in synthetic data,
|
||||
computes the probability of that query sample to be part of the synthetic dataset.
|
||||
|
||||
:param distances: distance between every query sample in batch to its KNNs among synthetic samples, a numpy
|
||||
array of size (n, k) with n being the number of samples, k - the number of KNNs
|
||||
:return:
|
||||
probability estimates of the query samples being generated and so - of being part of the synthetic set, a
|
||||
numpy array of size (n,)
|
||||
array of size (n, k) with n being the number of samples, k - the number of KNNs.
|
||||
:return: probability estimates of the query samples being generated and so - of being part of the synthetic set,
|
||||
a numpy array of size (n,)
|
||||
"""
|
||||
return np.average(np.exp(-distances), axis=1)
|
||||
|
|
|
|||
|
|
@ -8,11 +8,21 @@ DEFAULT_DATASET_NAME = "dataset"
|
|||
|
||||
@dataclass
|
||||
class DatasetAttackResult:
|
||||
"""
|
||||
Basic class for storing privacy risk assessment results.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class DatasetAttackScore:
|
||||
"""
|
||||
Basic class for storing privacy risk assessment scores.
|
||||
|
||||
:param dataset_name: The name of the dataset that was assessed.
|
||||
:param risk_score: The privacy risk score.
|
||||
:param result: An optional, more detailed attack result (a DatasetAttackResult).
|
||||
"""
|
||||
dataset_name: str
|
||||
risk_score: float
|
||||
result: Optional[DatasetAttackResult]
|
||||
|
|
@ -20,5 +30,11 @@ class DatasetAttackScore:
|
|||
|
||||
@dataclass
|
||||
class DatasetAttackResultMembership(DatasetAttackResult):
|
||||
"""
|
||||
Class for storing membership attack results.
|
||||
|
||||
:param member_probabilities: The attack probabilities for member samples.
|
||||
:param non_member_probabilities: The attack probabilities for non-member samples.
|
||||
"""
|
||||
member_probabilities: np.ndarray
|
||||
non_member_probabilities: np.ndarray
|
||||
|
|
|
|||
|
|
@ -20,16 +20,16 @@ K = 1 # Number of nearest neighbors to search. For DCR we need only the nearest
|
|||
|
||||
@dataclass
|
||||
class DatasetAttackConfigWholeDatasetKnnDistance(Config):
|
||||
"""Configuration for DatasetAttackWholeDatasetKnnDistance.
|
||||
"""
|
||||
Configuration for DatasetAttackWholeDatasetKnnDistance.
|
||||
|
||||
Attributes:
|
||||
use_batches: Divide query samples into batches or not.
|
||||
batch_size: Query sample batch size.
|
||||
compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must return
|
||||
one value indicating the distance between those vectors.
|
||||
See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation.
|
||||
distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in
|
||||
sklearn.neighbors.NearestNeighbors documentation.
|
||||
:param use_batches: Divide query samples into batches or not.
|
||||
:param batch_size: Query sample batch size.
|
||||
:param compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must
|
||||
return one value indicating the distance between those vectors.
|
||||
See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation.
|
||||
:param distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in
|
||||
sklearn.neighbors.NearestNeighbors documentation.
|
||||
"""
|
||||
use_batches: bool = False
|
||||
batch_size: int = 10
|
||||
|
|
@ -39,41 +39,40 @@ class DatasetAttackConfigWholeDatasetKnnDistance(Config):
|
|||
|
||||
@dataclass
|
||||
class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore):
|
||||
"""DatasetAttackWholeDatasetKnnDistance privacy risk score.
|
||||
"""
|
||||
DatasetAttackWholeDatasetKnnDistance privacy risk score.
|
||||
|
||||
:param dataset_name: Dataset name to be used in reports.
|
||||
:param share: The share of synthetic records closer to the training than the holdout dataset.
|
||||
A value of 0.5 or close to it means good privacy.
|
||||
"""
|
||||
share: float
|
||||
assessment_type: str = 'WholeDatasetKnnDistance' # to be used in reports
|
||||
|
||||
def __init__(self, dataset_name: str, share: float) -> None:
|
||||
"""
|
||||
dataset_name: dataset name to be used in reports
|
||||
share : the share of synthetic records closer to the training than the holdout dataset.
|
||||
A value of 0.5 or close to it means good privacy.
|
||||
"""
|
||||
super().__init__(dataset_name=dataset_name, risk_score=share, result=None)
|
||||
self.share = share
|
||||
|
||||
|
||||
class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
|
||||
"""
|
||||
Privacy risk assessment for synthetic datasets based on distances of synthetic data records from
|
||||
members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic
|
||||
records closer to the training than the holdout dataset.
|
||||
By default, the Euclidean distance is used (L2 norm), but another compute_distance() method can be provided in
|
||||
configuration instead.
|
||||
Privacy risk assessment for synthetic datasets based on distances of synthetic data records from
|
||||
members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic
|
||||
records closer to the training than the holdout dataset.
|
||||
By default, the Euclidean distance is used (L2 norm), but another compute_distance() method can be provided in
|
||||
configuration instead.
|
||||
|
||||
:param original_data_members: A container for the training original samples and labels.
|
||||
:param original_data_non_members: A container for the holdout original samples and labels.
|
||||
:param synthetic_data: A container for the synthetic samples and labels.
|
||||
:param config: Configuration parameters to guide the assessment process, optional.
|
||||
:param dataset_name: A name to identify this dataset, optional.
|
||||
"""
|
||||
|
||||
def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
|
||||
synthetic_data: ArrayDataset,
|
||||
config: DatasetAttackConfigWholeDatasetKnnDistance = DatasetAttackConfigWholeDatasetKnnDistance(),
|
||||
dataset_name: str = DEFAULT_DATASET_NAME):
|
||||
"""
|
||||
:param original_data_members: A container for the training original samples and labels
|
||||
:param original_data_non_members: A container for the holdout original samples and labels
|
||||
:param synthetic_data: A container for the synthetic samples and labels
|
||||
:param config: Configuration parameters to guide the assessment process, optional
|
||||
:param dataset_name: A name to identify this dataset, optional
|
||||
"""
|
||||
attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size)
|
||||
super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name,
|
||||
attack_strategy_utils)
|
||||
|
|
@ -90,6 +89,7 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
|
|||
"""
|
||||
Calculate the share of synthetic records closer to the training than the holdout dataset, based on the
|
||||
DCR computed by 'calculate_distances()'.
|
||||
|
||||
:return:
|
||||
score of the attack, based on the NN distances from the query samples to the synthetic data samples
|
||||
"""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue