Increase version to 0.2.0 (#74)

* Remove tensorflow dependency if not using keras model
* Remove xgboost dependency if not using xgboost model
* Documentation updates

Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
abigailgold 2023-05-08 12:50:55 +03:00 committed by GitHub
parent 782edabd58
commit 8a9ef80146
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
25 changed files with 306 additions and 152 deletions

View file

@@ -9,21 +9,20 @@ from apt.utils.datasets import ArrayDataset
class AttackStrategyUtils(abc.ABC):
"""
Abstract base class for common utilities of various privacy attack strategies.
Abstract base class for common utilities of various privacy attack strategies.
"""
pass
class KNNAttackStrategyUtils(AttackStrategyUtils):
"""
Common utilities for attack strategy based on KNN distances.
Common utilities for attack strategy based on KNN distances.
:param use_batches: Use batches with a progress meter or not when finding KNNs for query set.
:param batch_size: if use_batches=True, the size of batch_size should be > 0.
"""
def __init__(self, use_batches: bool = False, batch_size: int = 10) -> None:
"""
:param use_batches: Use batches with a progress meter or not when finding KNNs for query set
:param batch_size: if use_batches=True, the size of batch_size should be > 0
"""
self.use_batches = use_batches
self.batch_size = batch_size
if use_batches:
@@ -31,11 +30,18 @@ class KNNAttackStrategyUtils(AttackStrategyUtils):
raise ValueError(f"When using batching batch_size should be > 0, and not {batch_size}")
def fit(self, knn_learner: NearestNeighbors, dataset: ArrayDataset):
"""
Fit the KNN learner.
:param knn_learner: The KNN model to fit.
:param dataset: The training set to fit the model on.
"""
knn_learner.fit(dataset.get_samples())
def find_knn(self, knn_learner: NearestNeighbors, query_samples: ArrayDataset, distance_processor=None):
"""
Nearest neighbor search function.
:param query_samples: query samples, to which nearest neighbors are to be found
:param knn_learner: unsupervised learner for implementing neighbor searches, after it was fitted
:param distance_processor: function for processing the distance into another more relevant metric per sample.

View file

@@ -15,6 +15,12 @@ from apt.utils.datasets import ArrayDataset
@dataclass
class DatasetAssessmentManagerConfig:
"""
Configuration for DatasetAssessmentManager.
:param persist_reports: Whether to save assessment results to filesystem.
:param generate_plots: Whether to generate and visualize plots as part of assessment.
"""
persist_reports: bool = False
generate_plots: bool = False
@@ -22,14 +28,13 @@ class DatasetAssessmentManagerConfig:
class DatasetAssessmentManager:
"""
The main class for running dataset assessment attacks.
:param config: Configuration parameters to guide the dataset assessment process
"""
attack_scores_per_record_knn_probabilities: list[DatasetAttackScore] = []
attack_scores_whole_dataset_knn_distance: list[DatasetAttackScore] = []
def __init__(self, config: Optional[DatasetAssessmentManagerConfig] = DatasetAssessmentManagerConfig) -> None:
"""
:param config: Configuration parameters to guide the dataset assessment process
"""
self.config = config
def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
@@ -67,14 +72,17 @@ class DatasetAssessmentManager:
return [score_gl, score_h]
def dump_all_scores_to_files(self):
"""
Save assessment results to filesystem.
"""
if self.config.persist_reports:
results_log_file = "_results.log.csv"
self.dump_scores_to_file(self.attack_scores_per_record_knn_probabilities,
self._dump_scores_to_file(self.attack_scores_per_record_knn_probabilities,
"per_record_knn_probabilities" + results_log_file, True)
self.dump_scores_to_file(self.attack_scores_whole_dataset_knn_distance,
self._dump_scores_to_file(self.attack_scores_whole_dataset_knn_distance,
"whole_dataset_knn_distance" + results_log_file, True)
@staticmethod
def dump_scores_to_file(attack_scores: list[DatasetAttackScore], filename: str, header: bool):
def _dump_scores_to_file(attack_scores: list[DatasetAttackScore], filename: str, header: bool):
run_results_df = pd.DataFrame(attack_scores).drop('result', axis=1, errors='ignore') # don't serialize result
run_results_df.to_csv(filename, header=header, encoding='utf-8', index=False, mode='w') # Overwrite

View file

@@ -16,32 +16,30 @@ from apt.utils.datasets import ArrayDataset
class Config(abc.ABC):
"""
The base class for dataset attack configurations
The base class for dataset attack configurations
"""
pass
class DatasetAttack(abc.ABC):
"""
The interface for performing privacy attack for risk assessment of synthetic datasets to be used in AI model
training. The original data members (training data) and non-members (the holdout data) should be available.
For reliability, all the datasets should be preprocessed and normalized.
The interface for performing privacy attack for risk assessment of synthetic datasets to be used in AI model
training. The original data members (training data) and non-members (the holdout data) should be available.
For reliability, all the datasets should be preprocessed and normalized.
:param original_data_members: A container for the training original samples and labels,
only samples are used in the assessment
:param original_data_non_members: A container for the holdout original samples and labels,
only samples are used in the assessment
:param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment
:param config: Configuration parameters to guide the assessment process
:param dataset_name: A name to identify the dataset under attack, optional
:param attack_strategy_utils: Utils for use with the attack strategy, optional
"""
def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
synthetic_data: ArrayDataset, config: Config, dataset_name: str,
attack_strategy_utils: Optional[AttackStrategyUtils] = None) -> None:
"""
:param original_data_members: A container for the training original samples and labels,
only samples are used in the assessment
:param original_data_non_members: A container for the holdout original samples and labels,
only samples are used in the assessment
:param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment
:param config: Configuration parameters to guide the assessment process
:param dataset_name: A name to identify the dataset under attack, optional
:param attack_strategy_utils: Utils for use with the attack strategy, optional
"""
self.original_data_members = original_data_members
self.original_data_non_members = original_data_non_members
self.synthetic_data = synthetic_data
@@ -52,7 +50,8 @@ class DatasetAttack(abc.ABC):
@abc.abstractmethod
def assess_privacy(self) -> DatasetAttackScore:
"""
Assess the privacy of the dataset
Assess the privacy of the dataset.
:return:
score: DatasetAttackScore the privacy attack risk score
"""
@@ -61,14 +60,15 @@ class DatasetAttack(abc.ABC):
class DatasetAttackMembership(DatasetAttack):
"""
An abstract base class for performing privacy risk assessment for synthetic datasets on a per-record level.
An abstract base class for performing privacy risk assessment for synthetic datasets on a per-record level.
"""
@abc.abstractmethod
def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultMembership,
generate_plot: bool = False) -> DatasetAttackScore:
"""
Calculate dataset privacy score based on the result of the privacy attack
Calculate dataset privacy score based on the result of the privacy attack.
:return:
score: DatasetAttackScore
"""
@@ -78,15 +78,16 @@ class DatasetAttackMembership(DatasetAttack):
def plot_roc_curve(dataset_name: str, member_probabilities: np.ndarray, non_member_probabilities: np.ndarray,
filename_prefix: str = ""):
"""
Plot ROC curve
:param dataset_name: dataset name, will become part of the plot filename
:param member_probabilities: probability estimates of the member samples, the training data
:param non_member_probabilities: probability estimates of the non-member samples, the hold-out data
:param filename_prefix: name prefix for the ROC curve plot
Plot ROC curve.
:param dataset_name: dataset name, will become part of the plot filename.
:param member_probabilities: probability estimates of the member samples, the training data.
:param non_member_probabilities: probability estimates of the non-member samples, the hold-out data.
:param filename_prefix: name prefix for the ROC curve plot.
"""
labels = np.concatenate((np.zeros((len(non_member_probabilities),)), np.ones((len(member_probabilities),))))
results = np.concatenate((non_member_probabilities, member_probabilities))
svc_disp = RocCurveDisplay.from_predictions(labels, results)
RocCurveDisplay.from_predictions(labels, results)
plt.plot([0, 1], [0, 1], color="navy", linewidth=2, linestyle="--", label='No skills')
plt.title('ROC curve')
plt.savefig(f'{filename_prefix}{dataset_name}_roc_curve.png')
@@ -94,9 +95,10 @@ class DatasetAttackMembership(DatasetAttack):
@staticmethod
def calculate_metrics(member_probabilities: np.ndarray, non_member_probabilities: np.ndarray):
"""
Calculate attack performance metrics
:param member_probabilities: probability estimates of the member samples, the training data
:param non_member_probabilities: probability estimates of the non-member samples, the hold-out data
Calculate attack performance metrics.
:param member_probabilities: probability estimates of the member samples, the training data.
:param non_member_probabilities: probability estimates of the non-member samples, the hold-out data.
:return:
fpr: False Positive rate
tpr: True Positive rate

View file

@@ -19,18 +19,18 @@ from apt.utils.datasets import ArrayDataset
@dataclass
class DatasetAttackConfigMembershipKnnProbabilities(Config):
"""Configuration for DatasetAttackMembershipKnnProbabilities.
"""
Configuration for DatasetAttackMembershipKnnProbabilities.
Attributes:
k: Number of nearest neighbors to search
use_batches: Divide query samples into batches or not.
batch_size: Query sample batch size.
compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must return
one value indicating the distance between those vectors.
See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation.
distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in
sklearn.neighbors.NearestNeighbors documentation.
generate_plot: Generate or not an AUC ROC curve and persist it in a file
:param k: Number of nearest neighbors to search.
:param use_batches: Divide query samples into batches or not.
:param batch_size: Query sample batch size.
:param compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must
return one value indicating the distance between those vectors.
See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation.
:param distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in
sklearn.neighbors.NearestNeighbors documentation.
:param generate_plot: Generate or not an AUC ROC curve and persist it in a file.
"""
k: int = 5
use_batches: bool = False
@@ -42,7 +42,14 @@ class DatasetAttackConfigMembershipKnnProbabilities(Config):
@dataclass
class DatasetAttackScoreMembershipKnnProbabilities(DatasetAttackScore):
"""DatasetAttackMembershipKnnProbabilities privacy risk score.
"""
DatasetAttackMembershipKnnProbabilities privacy risk score.
:param dataset_name: dataset name to be used in reports
:param roc_auc_score: the area under the receiver operating characteristic curve (AUC ROC) to evaluate the
attack performance.
:param average_precision_score: the proportion of predicted members that are correctly members.
:param result: the result of the membership inference attack.
"""
roc_auc_score: float
average_precision_score: float
@@ -50,13 +57,6 @@ class DatasetAttackScoreMembershipKnnProbabilities(DatasetAttackScore):
def __init__(self, dataset_name: str, roc_auc_score: float, average_precision_score: float,
result: DatasetAttackResultMembership) -> None:
"""
dataset_name: dataset name to be used in reports
roc_auc_score: the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack
performance.
average_precision_score: the proportion of predicted members that are correctly members
result: the result of the membership inference attack
"""
super().__init__(dataset_name=dataset_name, risk_score=roc_auc_score, result=result)
self.roc_auc_score = roc_auc_score
self.average_precision_score = average_precision_score
@@ -64,24 +64,23 @@ class DatasetAttackScoreMembershipKnnProbabilities(DatasetAttackScore):
class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership):
"""
Privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of
members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset.
By default, the Euclidean distance is used (L2 norm), but another ``compute_distance()`` method can be provided
in configuration instead.
The area under the receiver operating characteristic curve (AUC ROC) gives the privacy risk measure.
Privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of
members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset.
By default, the Euclidean distance is used (L2 norm), but another ``compute_distance()`` method can be provided
in configuration instead.
The area under the receiver operating characteristic curve (AUC ROC) gives the privacy risk measure.
:param original_data_members: A container for the training original samples and labels
:param original_data_non_members: A container for the holdout original samples and labels
:param synthetic_data: A container for the synthetic samples and labels
:param config: Configuration parameters to guide the attack, optional
:param dataset_name: A name to identify this dataset, optional
"""
def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
synthetic_data: ArrayDataset,
config: DatasetAttackConfigMembershipKnnProbabilities = DatasetAttackConfigMembershipKnnProbabilities(),
dataset_name: str = DEFAULT_DATASET_NAME):
"""
:param original_data_members: A container for the training original samples and labels
:param original_data_non_members: A container for the holdout original samples and labels
:param synthetic_data: A container for the synthetic samples and labels
:param config: Configuration parameters to guide the attack, optional
:param dataset_name: A name to identify this dataset, optional
"""
attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size)
super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name,
attack_strategy_utils)
@@ -103,10 +102,9 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership):
by the Parzen window density estimation in ``probability_per_sample()``, computed from the NN distances from the
query samples to the synthetic data samples.
:return:
Privacy score of the attack together with the attack result with the probabilities of member and
non-member samples to be generated by the synthetic data generator based on the NN distances from the
query samples to the synthetic data samples
:return: Privacy score of the attack together with the attack result with the probabilities of member and
non-member samples to be generated by the synthetic data generator based on the NN distances from the
query samples to the synthetic data samples
"""
# nearest neighbor search
self.attack_strategy_utils.fit(self.knn_learner, self.synthetic_data)
@@ -130,11 +128,11 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership):
"""
Evaluate privacy score from the probabilities of member and non-member samples to be generated by the synthetic
data generator. The probabilities are computed by the ``assess_privacy()`` method.
:param dataset_attack_result attack result containing probabilities of member and non-member samples to be
generated by the synthetic data generator
:param generate_plot generate AUC ROC curve plot and persist it
:return:
score of the attack, based on distance-based probabilities - mainly the ROC AUC score
:param dataset_attack_result: attack result containing probabilities of member and non-member samples to be
generated by the synthetic data generator.
:param generate_plot: generate AUC ROC curve plot and persist it.
:return: score of the attack, based on distance-based probabilities - mainly the ROC AUC score.
"""
member_proba, non_member_proba = \
dataset_attack_result.member_probabilities, dataset_attack_result.non_member_probabilities
@@ -151,10 +149,10 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership):
"""
For every sample represented by its distance from the query sample to its KNN in synthetic data,
computes the probability of the synthetic data to be part of the query dataset.
:param distances: distance between every query sample in batch to its KNNs among synthetic samples, a numpy
array of size (n, k) with n being the number of samples, k - the number of KNNs
:return:
probability estimates of the query samples being generated and so - of being part of the synthetic set, a
numpy array of size (n,)
array of size (n, k) with n being the number of samples, k - the number of KNNs.
:return: probability estimates of the query samples being generated and so - of being part of the synthetic set,
a numpy array of size (n,)
"""
return np.average(np.exp(-distances), axis=1)

View file

@@ -8,11 +8,21 @@ DEFAULT_DATASET_NAME = "dataset"
@dataclass
class DatasetAttackResult:
"""
Basic class for storing privacy risk assessment results.
"""
pass
@dataclass
class DatasetAttackScore:
"""
Basic class for storing privacy risk assessment scores.
:param dataset_name: The name of the dataset that was assessed.
:param risk_score: The privacy risk score.
:param result: An optional list of more detailed results.
"""
dataset_name: str
risk_score: float
result: Optional[DatasetAttackResult]
@@ -20,5 +30,11 @@ class DatasetAttackScore:
@dataclass
class DatasetAttackResultMembership(DatasetAttackResult):
"""
Class for storing membership attack results.
:param member_probabilities: The attack probabilities for member samples.
:param non_member_probabilities: The attack probabilities for non-member samples.
"""
member_probabilities: np.ndarray
non_member_probabilities: np.ndarray

View file

@@ -20,16 +20,16 @@ K = 1  # Number of nearest neighbors to search. For DCR we need only the nearest
@dataclass
class DatasetAttackConfigWholeDatasetKnnDistance(Config):
"""Configuration for DatasetAttackWholeDatasetKnnDistance.
"""
Configuration for DatasetAttackWholeDatasetKnnDistance.
Attributes:
use_batches: Divide query samples into batches or not.
batch_size: Query sample batch size.
compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must return
one value indicating the distance between those vectors.
See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation.
distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in
sklearn.neighbors.NearestNeighbors documentation.
:param use_batches: Divide query samples into batches or not.
:param batch_size: Query sample batch size.
:param compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must
return one value indicating the distance between those vectors.
See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation.
:param distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in
sklearn.neighbors.NearestNeighbors documentation.
"""
use_batches: bool = False
batch_size: int = 10
@@ -39,41 +39,40 @@ class DatasetAttackConfigWholeDatasetKnnDistance(Config):
@dataclass
class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore):
"""DatasetAttackWholeDatasetKnnDistance privacy risk score.
"""
DatasetAttackWholeDatasetKnnDistance privacy risk score.
:param dataset_name: Dataset name to be used in reports.
:param share: The share of synthetic records closer to the training than the holdout dataset.
A value of 0.5 or close to it means good privacy.
"""
share: float
assessment_type: str = 'WholeDatasetKnnDistance' # to be used in reports
def __init__(self, dataset_name: str, share: float) -> None:
"""
dataset_name: dataset name to be used in reports
share : the share of synthetic records closer to the training than the holdout dataset.
A value of 0.5 or close to it means good privacy.
"""
super().__init__(dataset_name=dataset_name, risk_score=share, result=None)
self.share = share
class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
"""
Privacy risk assessment for synthetic datasets based on distances of synthetic data records from
members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic
records closer to the training than the holdout dataset.
By default, the Euclidean distance is used (L2 norm), but another compute_distance() method can be provided in
configuration instead.
Privacy risk assessment for synthetic datasets based on distances of synthetic data records from
members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic
records closer to the training than the holdout dataset.
By default, the Euclidean distance is used (L2 norm), but another compute_distance() method can be provided in
configuration instead.
:param original_data_members: A container for the training original samples and labels.
:param original_data_non_members: A container for the holdout original samples and labels.
:param synthetic_data: A container for the synthetic samples and labels.
:param config: Configuration parameters to guide the assessment process, optional.
:param dataset_name: A name to identify this dataset, optional.
"""
def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
synthetic_data: ArrayDataset,
config: DatasetAttackConfigWholeDatasetKnnDistance = DatasetAttackConfigWholeDatasetKnnDistance(),
dataset_name: str = DEFAULT_DATASET_NAME):
"""
:param original_data_members: A container for the training original samples and labels
:param original_data_non_members: A container for the holdout original samples and labels
:param synthetic_data: A container for the synthetic samples and labels
:param config: Configuration parameters to guide the assessment process, optional
:param dataset_name: A name to identify this dataset, optional
"""
attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size)
super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name,
attack_strategy_utils)
@@ -90,6 +89,7 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
"""
Calculate the share of synthetic records closer to the training than the holdout dataset, based on the
DCR computed by 'calculate_distances()'.
:return:
score of the attack, based on the NN distances from the query samples to the synthetic data samples
"""