mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-06-08 15:05:13 +02:00
Fix share calculation, and find only 1 KNN per sample for it
Signed-off-by: Maya Anderson <mayaa@il.ibm.com>
This commit is contained in:
parent
e5f6089b23
commit
185d9b9664
4 changed files with 48 additions and 37 deletions
|
|
@ -19,13 +19,11 @@ class KNNAttackStrategyUtils(AttackStrategyUtils):
|
|||
Common utilities for attack strategy based on KNN distances.
|
||||
"""
|
||||
|
||||
def __init__(self, k: int, use_batches: bool = False, batch_size: int = 10) -> None:
|
||||
def __init__(self, use_batches: bool = False, batch_size: int = 10) -> None:
|
||||
"""
|
||||
:param k: How many nearest neighbors to search
|
||||
:param use_batches: Use batches with a progress meter or not when finding KNNs for query set
|
||||
:param batch_size: if use_batches=True, the size of batch_size should be > 0
|
||||
"""
|
||||
self.k = k
|
||||
self.use_batches = use_batches
|
||||
self.batch_size = batch_size
|
||||
if use_batches:
|
||||
|
|
@ -49,7 +47,7 @@ class KNNAttackStrategyUtils(AttackStrategyUtils):
|
|||
"""
|
||||
samples = query_samples.get_samples()
|
||||
if not self.use_batches:
|
||||
distances, _ = knn_learner.kneighbors(samples, self.k, return_distance=True)
|
||||
distances, _ = knn_learner.kneighbors(samples, return_distance=True)
|
||||
if distance_processor:
|
||||
return distance_processor(distances)
|
||||
else:
|
||||
|
|
@ -61,7 +59,7 @@ class KNNAttackStrategyUtils(AttackStrategyUtils):
|
|||
x_batch = np.reshape(x_batch, [self.batch_size, -1])
|
||||
|
||||
# dist_batch: distance between every query sample in batch to its KNNs among training samples
|
||||
dist_batch, _ = knn_learner.kneighbors(x_batch, self.k, return_distance=True)
|
||||
dist_batch, _ = knn_learner.kneighbors(x_batch, return_distance=True)
|
||||
|
||||
# The probability of each sample to be generated
|
||||
if distance_processor:
|
||||
|
|
|
|||
|
|
@ -3,10 +3,12 @@ from typing import Optional
|
|||
|
||||
import pandas as pd
|
||||
|
||||
from apt.risk.data_assessment.per_record_knn_probabilities_dataset_attack_ import DatasetAttackConfigPerRecordKnnProbabilities, \
|
||||
DatasetAttackPerRecordKnnProbabilities, DatasetAttackScorePerRecordKnnProbabilities
|
||||
from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import DatasetAttackConfigWholeDatasetKnnDistance, \
|
||||
DatasetAttackWholeDatasetKnnDistance, DatasetAttackScoreWholeDatasetKnnDistance
|
||||
from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import \
|
||||
DatasetAttackConfigWholeDatasetKnnDistance, DatasetAttackWholeDatasetKnnDistance, \
|
||||
DatasetAttackScoreWholeDatasetKnnDistance
|
||||
from apt.risk.data_assessment.per_record_knn_probabilities_dataset_attack_ import \
|
||||
DatasetAttackConfigPerRecordKnnProbabilities, DatasetAttackPerRecordKnnProbabilities, \
|
||||
DatasetAttackScorePerRecordKnnProbabilities
|
||||
from apt.utils.datasets import ArrayDataset
|
||||
|
||||
|
||||
|
|
@ -32,7 +34,7 @@ class DatasetAssessmentManager:
|
|||
def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
|
||||
synthetic_data: ArrayDataset, dataset_name: str = "dataset") -> (
|
||||
DatasetAttackScorePerRecordKnnProbabilities, DatasetAttackScoreWholeDatasetKnnDistance):
|
||||
config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False, k=5)
|
||||
config_gl = DatasetAttackConfigPerRecordKnnProbabilities(use_batches=False)
|
||||
mgr = DatasetAttackPerRecordKnnProbabilities(original_data_members,
|
||||
original_data_non_members,
|
||||
synthetic_data,
|
||||
|
|
@ -43,7 +45,7 @@ class DatasetAssessmentManager:
|
|||
score_g = mgr.calculate_privacy_score(result, generate_plot=self.config.generate_plots)
|
||||
self.attack_scores_per_record_knn_probabilities.append(score_g)
|
||||
|
||||
config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False, k=5)
|
||||
config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False)
|
||||
mgr_h = DatasetAttackWholeDatasetKnnDistance(original_data_members, original_data_non_members, synthetic_data,
|
||||
dataset_name,
|
||||
config_h)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
"""
|
||||
This module implements privacy risk assessment of synthetic datasets based on the paper
|
||||
"Holdout-Based Fidelity and Privacy Assessment of Mixed-Type Synthetic Data" by M. Platzer and T. Reutterer.
|
||||
This module implements privacy risk assessment of synthetic datasets based on the papers
|
||||
"Data Synthesis based on Generative Adversarial Networks." by N. Park, M. Mohammadi, K. Gorde, S. Jajodia, H. Park,
|
||||
and Y. Kim in International Conference on Very Large Data Bases (VLDB), 2018.
|
||||
and "Holdout-Based Fidelity and Privacy Assessment of Mixed-Type Synthetic Data" by M. Platzer and T. Reutterer.
|
||||
and on a variation of its reference implementation in https://github.com/mostly-ai/paper-fidelity-accuracy.
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
|
|
@ -14,13 +16,14 @@ from apt.risk.data_assessment.dataset_attack import DatasetAttackWhole, Config
|
|||
from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore
|
||||
from apt.utils.datasets import ArrayDataset
|
||||
|
||||
K = 1 # Number of nearest neighbors to search. For DCR we need only the nearest neighbor.
|
||||
|
||||
|
||||
@dataclass
|
||||
class DatasetAttackConfigWholeDatasetKnnDistance(Config):
|
||||
"""Configuration for DatasetAttackWholeDatasetKnnDistance.
|
||||
|
||||
Attributes:
|
||||
k: Number of nearest neighbors to search
|
||||
use_batches: Divide query samples into batches or not.
|
||||
batch_size: Query sample batch size.
|
||||
compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must return
|
||||
|
|
@ -29,7 +32,6 @@ class DatasetAttackConfigWholeDatasetKnnDistance(Config):
|
|||
distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in
|
||||
sklearn.neighbors.NearestNeighbors documentation.
|
||||
"""
|
||||
k: int = 1
|
||||
use_batches: bool = False
|
||||
batch_size: int = 10
|
||||
compute_distance: callable = None
|
||||
|
|
@ -41,7 +43,8 @@ class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore):
|
|||
"""DatasetAttackWholeDatasetKnnDistance privacy score.
|
||||
Attributes
|
||||
----------
|
||||
share : the share of synthetic records closer to the training than the holdout dataset
|
||||
share : the share of synthetic records closer to the training than the holdout dataset.
|
||||
A value of 0.5 or close to it means good privacy.
|
||||
assessment_type : assessment type is 'WholeDatasetKnnDistance', to be used in reports
|
||||
"""
|
||||
share: float
|
||||
|
|
@ -53,11 +56,14 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttackWhole):
|
|||
Privacy risk assessment for synthetic datasets based on distances of synthetic data records from
|
||||
members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic
|
||||
records closer to the training than the holdout dataset.
|
||||
By default, the Euclidean distance is used (L2 norm), but another compute_distance() method can be provided in
|
||||
configuration instead.
|
||||
"""
|
||||
|
||||
def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
|
||||
synthetic_data: ArrayDataset, dataset_name: str,
|
||||
config: Optional[DatasetAttackConfigWholeDatasetKnnDistance] = DatasetAttackConfigWholeDatasetKnnDistance()):
|
||||
config: Optional[
|
||||
DatasetAttackConfigWholeDatasetKnnDistance] = DatasetAttackConfigWholeDatasetKnnDistance()):
|
||||
"""
|
||||
:param original_data_members: A container for the training original samples and labels
|
||||
:param original_data_non_members: A container for the holdout original samples and labels
|
||||
|
|
@ -66,44 +72,47 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttackWhole):
|
|||
:param config: Configuration parameters to guide the assessment process such as which attack
|
||||
frameworks to use, optional
|
||||
"""
|
||||
attack_strategy_utils = KNNAttackStrategyUtils(config.k, config.use_batches, config.batch_size)
|
||||
attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size)
|
||||
super().__init__(original_data_members, original_data_non_members, synthetic_data, dataset_name,
|
||||
attack_strategy_utils, config)
|
||||
if config.compute_distance:
|
||||
self.nn_obj_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto',
|
||||
metric=config.compute_distance,
|
||||
self.nn_obj_members = NearestNeighbors(n_neighbors=K, metric=config.compute_distance,
|
||||
metric_params=config.distance_params)
|
||||
self.nn_obj_non_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto',
|
||||
metric=config.compute_distance,
|
||||
self.nn_obj_non_members = NearestNeighbors(n_neighbors=K, metric=config.compute_distance,
|
||||
metric_params=config.distance_params)
|
||||
else:
|
||||
self.nn_obj_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto')
|
||||
self.nn_obj_non_members = NearestNeighbors(n_neighbors=config.k, algorithm='auto')
|
||||
self.nn_obj_members = NearestNeighbors(n_neighbors=K)
|
||||
self.nn_obj_non_members = NearestNeighbors(n_neighbors=K)
|
||||
|
||||
def assess_privacy(self) -> DatasetAttackScoreWholeDatasetKnnDistance:
|
||||
"""
|
||||
Calculate the share of synthetic records closer to the training than the holdout dataset
|
||||
Calculate the share of synthetic records closer to the training than the holdout dataset, based on the
|
||||
DCR computed by 'calculate_distances()'.
|
||||
:return:
|
||||
result of the attack, based on the NN distances from the query samples to the synthetic data samples
|
||||
"""
|
||||
member_distances, non_member_distances = self.calculate_distances()
|
||||
n_members = len(member_distances)
|
||||
n_non_members = len(non_member_distances)
|
||||
assert (n_members == n_non_members) # distance of the synth. records to members and to non-members
|
||||
# distance of the synth. records to members and to non-members
|
||||
assert (len(member_distances) == len(non_member_distances))
|
||||
n_members = len(self.original_data_members.get_samples())
|
||||
n_non_members = len(self.original_data_non_members.get_samples())
|
||||
|
||||
# percent of synth. records closer to members,
|
||||
# and half those whose distance is similar to members and non-members
|
||||
share = np.mean(member_distances < non_member_distances) + 0.5 * np.mean(
|
||||
share = np.mean(member_distances < non_member_distances) + (n_members / (n_members + n_non_members)) * np.mean(
|
||||
member_distances == non_member_distances)
|
||||
score = DatasetAttackScoreWholeDatasetKnnDistance(self.dataset_name, share=share)
|
||||
return score
|
||||
|
||||
def calculate_distances(self):
|
||||
"""
|
||||
Calculate positive and negative query probabilities, based on their distance to their KNNs among
|
||||
synthetic samples.
|
||||
Calculate positive and negative query probabilities, based on their distance to their KNN among
|
||||
synthetic samples. This distance is called distance to the closest record (DCR), as defined by
|
||||
N. Park et al. in "Data Synthesis based on Generative Adversarial Networks."
|
||||
|
||||
:return:
|
||||
pos_distances: distances of each synthetic data member from its nearest training samples
|
||||
neg_distances: distances of each synthetic data member from its nearest validation samples
|
||||
pos_distances: distances of each synthetic data member from its nearest training sample
|
||||
neg_distances: distances of each synthetic data member from its nearest validation sample
|
||||
"""
|
||||
# nearest neighbor search
|
||||
self.attack_strategy_utils.fit(self.original_data_members, self.nn_obj_members)
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ class DatasetAttackConfigPerRecordKnnProbabilities(Config):
|
|||
distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in
|
||||
sklearn.neighbors.NearestNeighbors documentation.
|
||||
"""
|
||||
k: int = 1
|
||||
k: int = 5
|
||||
use_batches: bool = False
|
||||
batch_size: int = 10
|
||||
compute_distance: Callable = None
|
||||
|
|
@ -42,8 +42,8 @@ class DatasetAttackScorePerRecordKnnProbabilities(DatasetAttackScore):
|
|||
"""DatasetAttackPerRecordKnnProbabilities privacy score.
|
||||
Attributes
|
||||
----------
|
||||
roc_auc_score : the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack
|
||||
performance.
|
||||
roc_auc_score : the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack
|
||||
performance.
|
||||
average_precision_score: the proportion of Predicted Positive cases that are correctly Real Positives (members)
|
||||
assessment_type : assessment type is 'PerRecordKnnProbabilities', to be used in reports
|
||||
"""
|
||||
|
|
@ -56,6 +56,8 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
|
|||
"""
|
||||
Privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of
|
||||
members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset.
|
||||
By default, the Euclidean distance is used (L2 norm), but another compute_distance() method can be provided in
|
||||
configuration instead.
|
||||
The area under the receiver operating characteristic curve (AUC ROC) gives the privacy risk measure.
|
||||
"""
|
||||
|
||||
|
|
@ -70,7 +72,7 @@ class DatasetAttackPerRecordKnnProbabilities(DatasetAttackPerRecord):
|
|||
:param dataset_name: A name to identify this dataset
|
||||
:param config: Configuration parameters to guide the attack, optional
|
||||
"""
|
||||
attack_strategy_utils = KNNAttackStrategyUtils(config.k, config.use_batches, config.batch_size)
|
||||
attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size)
|
||||
super().__init__(original_data_members, original_data_non_members, synthetic_data, dataset_name,
|
||||
attack_strategy_utils, config)
|
||||
if config.compute_distance:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue