Fix share calculation, and find only 1 KNN per sample for it

Signed-off-by: Maya Anderson <mayaa@il.ibm.com>
This commit is contained in:
Maya Anderson 2023-03-07 23:03:41 +02:00
parent e5f6089b23
commit 185d9b9664
4 changed files with 48 additions and 37 deletions

View file

@ -19,13 +19,11 @@ class KNNAttackStrategyUtils(AttackStrategyUtils):
Common utilities for attack strategy based on KNN distances.
"""
def __init__(self, k: int, use_batches: bool = False, batch_size: int = 10) -> None:
def __init__(self, use_batches: bool = False, batch_size: int = 10) -> None:
"""
:param k: How many nearest neighbors to search
:param use_batches: Use batches with a progress meter or not when finding KNNs for query set
:param batch_size: if use_batches=True, the size of batch_size should be > 0
"""
self.k = k
self.use_batches = use_batches
self.batch_size = batch_size
if use_batches:
@ -49,7 +47,7 @@ class KNNAttackStrategyUtils(AttackStrategyUtils):
"""
samples = query_samples.get_samples()
if not self.use_batches:
distances, _ = knn_learner.kneighbors(samples, self.k, return_distance=True)
distances, _ = knn_learner.kneighbors(samples, return_distance=True)
if distance_processor:
return distance_processor(distances)
else:
@ -61,7 +59,7 @@ class KNNAttackStrategyUtils(AttackStrategyUtils):
x_batch = np.reshape(x_batch, [self.batch_size, -1])
# dist_batch: distance between every query sample in batch to its KNNs among training samples
dist_batch, _ = knn_learner.kneighbors(x_batch, self.k, return_distance=True)
dist_batch, _ = knn_learner.kneighbors(x_batch, return_distance=True)
# The probability of each sample to be generated
if distance_processor:

View file

@ -3,10 +3,12 @@ from typing import Optional
import pandas as pd
from apt.risk.data_assessment.per_record_knn_probabilities_dataset_attack_ import DatasetAttackConfigPerRecordKnnProbabilities, \
DatasetAttackPerRecordKnnProbabilities, DatasetAttackScorePerRecordKnnProbabilities
from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import DatasetAttackConfigWholeDatasetKnnDistance, \
DatasetAttackWholeDatasetKnnDistance, DatasetAttackScoreWholeDatasetKnnDistance
from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import \
DatasetAttackConfigWholeDatasetKnnDistance, DatasetAttackWholeDatasetKnnDistance, \
DatasetAttackScoreWholeDatasetKnnDistance
from apt.risk.data_assessment.per_record_knn_probabilities_dataset_attack_ import \
DatasetAttackConfigPerRecordKnnProbabilities, DatasetAttackPerRecordKnnProbabilities, \
DatasetAttackScorePerRecordKnnProbabilities
from apt.utils.datasets import ArrayDataset
@ -32,7 +34,7 @@ class DatasetAssessmentManager:
def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
synthetic_data: ArrayDataset, dataset_name: str = "dataset") -> (
DatasetAttackScorePerRecordKnnProbabilities, DatasetAttackScoreWholeDatasetKnnDistance):
config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False, k=5)
config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False)
mgr = DatasetAttackPerRecordKnnProbabilities(original_data_members,
original_data_non_members,
synthetic_data,
@ -43,7 +45,7 @@ class DatasetAssessmentManager:
score_g = mgr.calculate_privacy_score(result, generate_plot=self.config.generate_plots)
self.attack_scores_per_record_knn_probabilities.append(score_g)
config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False, k=5)
config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False)
mgr_h = DatasetAttackWholeDatasetKnnDistance(original_data_members, original_data_non_members, synthetic_data,
dataset_name,
config_h)

View file

@ -1,6 +1,8 @@
"""
This module implements privacy risk assessment of synthetic datasets based on the paper
"Holdout-Based Fidelity and Privacy Assessment of Mixed-Type Synthetic Data" by M. Platzer and T. Reutterer.
This module implements privacy risk assessment of synthetic datasets based on the papers
"Data Synthesis based on Generative Adversarial Networks" by N. Park, M. Mohammadi, K. Gorde, S. Jajodia, H. Park,
and Y. Kim in International Conference on Very Large Data Bases (VLDB), 2018,
and "Holdout-Based Fidelity and Privacy Assessment of Mixed-Type Synthetic Data" by M. Platzer and T. Reutterer,
and on a variation of the latter's reference implementation in https://github.com/mostly-ai/paper-fidelity-accuracy.
"""
from dataclasses import dataclass
@ -14,13 +16,14 @@ from apt.risk.data_assessment.dataset_attack import DatasetAttackWhole, Config
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore
from apt.utils.datasets import ArrayDataset
K = 1 # Number of nearest neighbors to search. For DCR we need only the nearest neighbor.
@dataclass
class DatasetAttackConfigWholeDatasetKnnDistance(Config):
"""Configuration for DatasetAttackWholeDatasetKnnDistance.
Attributes:
k: Number of nearest neighbors to search
use_batches: Divide query samples into batches or not.
batch_size: Query sample batch size.
compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must return
@ -29,7 +32,6 @@ class DatasetAttackConfigWholeDatasetKnnDistance(Config):
distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in
sklearn.neighbors.NearestNeighbors documentation.
"""
k: int = 1
use_batches: bool = False
batch_size: int = 10
compute_distance: callable = None
@ -41,7 +43,8 @@ class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore):
"""DatasetAttackWholeDatasetKnnDistance privacy score.
Attributes
----------
share : the share of synthetic records closer to the training than the holdout dataset
share : the share of synthetic records closer to the training than the holdout dataset.
A value of 0.5 or close to it means good privacy.
assessment_type : assessment type is 'WholeDatasetKnnDistance', to be used in reports
"""
share: float
@ -53,11 +56,14 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttackWhole):
Privacy risk assessment for synthetic datasets based on distances of synthetic data records from
members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic
records closer to the training than the holdout dataset.
By default, the Euclidean distance is used (L2 norm), but another compute_distance() method can be provided in
configuration instead.
"""
def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
synthetic_data: ArrayDataset, dataset_name: str,
config: Optional[DatasetAttackConfigWholeDatasetKnnDistance] = DatasetAttackConfigWholeDatasetKnnDistance()):
config: Optional[
DatasetAttackConfigWholeDatasetKnnDistance] = DatasetAttackConfigWholeDatasetKnnDistance()):
"""
:param original_data_members: A container for the training original samples and labels
:param original_data_non_members: A container for the holdout original samples and labels
@ -66,44 +72,47 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttackWhole):
:param config: Configuration parameters to guide the assessment process such as which attack
frameworks to use, optional
"""
attack_strategy_utils = KNNAttackStrategyUtils(config.k, config.use_batches, config.batch_size)
attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size)
super().__init__(original_data_members, original_data_non_members, synthetic_data, dataset_name,
attack_strategy_utils, config)
if config.compute_distance:
self.nn_obj_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto',
metric=config.compute_distance,
self.nn_obj_members = NearestNeighbors(n_neighbors=K, metric=config.compute_distance,
metric_params=config.distance_params)
self.nn_obj_non_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto',
metric=config.compute_distance,
self.nn_obj_non_members = NearestNeighbors(n_neighbors=K, metric=config.compute_distance,
metric_params=config.distance_params)
else:
self.nn_obj_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto')
self.nn_obj_non_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto')
self.nn_obj_members = NearestNeighbors(n_neighbors=K)
self.nn_obj_non_members = NearestNeighbors(n_neighbors=K)
def assess_privacy(self) -> DatasetAttackScoreWholeDatasetKnnDistance:
"""
Calculate the share of synthetic records closer to the training than the holdout dataset
Calculate the share of synthetic records closer to the training than the holdout dataset, based on the
DCR computed by 'calculate_distances()'.
:return:
score of the attack, based on the nearest-neighbor distances from the synthetic data samples to the member and non-member samples
"""
member_distances, non_member_distances = self.calculate_distances()
n_members = len(member_distances)
n_non_members = len(non_member_distances)
assert (n_members == n_non_members) # distance of the synth. records to members and to non-members
# distance of the synth. records to members and to non-members
assert (len(member_distances) == len(non_member_distances))
n_members = len(self.original_data_members.get_samples())
n_non_members = len(self.original_data_non_members.get_samples())
# share of synth. records closer to members, plus a fraction of those whose distance to members and
# non-members is equal (0.5 before this change; the members' proportion of the original data after it)
share = np.mean(member_distances < non_member_distances) + 0.5 * np.mean(
share = np.mean(member_distances < non_member_distances) + (n_members / (n_members + n_non_members)) * np.mean(
member_distances == non_member_distances)
score = DatasetAttackScoreWholeDatasetKnnDistance(self.dataset_name, share=share)
return score
def calculate_distances(self):
"""
Calculate positive and negative query probabilities, based on their distance to their KNNs among
synthetic samples.
Calculate positive and negative query probabilities, based on their distance to their KNN among
synthetic samples. This distance is called distance to the closest record (DCR), as defined by
N. Park et al. in "Data Synthesis based on Generative Adversarial Networks."
:return:
pos_distances: distances of each synthetic data member from its nearest training samples
neg_distances: distances of each synthetic data member from its nearest validation samples
pos_distances: distances of each synthetic data member from its nearest training sample
neg_distances: distances of each synthetic data member from its nearest validation sample
"""
# nearest neighbor search
self.attack_strategy_utils.fit(self.original_data_members, self.nn_obj_members)

View file

@ -30,7 +30,7 @@ class DatasetAttackConfigPerRecordKnnProbabilities(Config):
distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in
sklearn.neighbors.NearestNeighbors documentation.
"""
k: int = 1
k: int = 5
use_batches: bool = False
batch_size: int = 10
compute_distance: Callable = None
@ -42,8 +42,8 @@ class DatasetAttackScorePerRecordKnnProbabilities(DatasetAttackScore):
"""DatasetAttackPerRecordKnnProbabilities privacy score.
Attributes
----------
roc_auc_score : the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack
performance.
roc_auc_score : the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack
performance.
average_precision_score: the proportion of Predicted Positive cases that are correctly Real Positives (members)
assessment_type : assessment type is 'PerRecordKnnProbabilities', to be used in reports
"""
@ -56,6 +56,8 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
"""
Privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of
members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset.
By default, the Euclidean distance is used (L2 norm), but another compute_distance() method can be provided in
configuration instead.
The area under the receiver operating characteristic curve (AUC ROC) gives the privacy risk measure.
"""
@ -70,7 +72,7 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
:param dataset_name: A name to identify this dataset
:param config: Configuration parameters to guide the attack, optional
"""
attack_strategy_utils = KNNAttackStrategyUtils(config.k, config.use_batches, config.batch_size)
attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size)
super().__init__(original_data_members, original_data_non_members, synthetic_data, dataset_name,
attack_strategy_utils, config)
if config.compute_distance: