From 8a9ef801467c44f054c54ce12d29006714dc7fde Mon Sep 17 00:00:00 2001 From: abigailgold <57357634+abigailgold@users.noreply.github.com> Date: Mon, 8 May 2023 12:50:55 +0300 Subject: [PATCH] Increase version to 0.2.0 (#74) * Remove tensorflow dependency if not using keras model * Remove xgboost dependency if not using xgboost model * Documentation updates Signed-off-by: abigailt --- apt/__init__.py | 2 +- apt/risk/__init__.py | 1 + .../data_assessment/attack_strategy_utils.py | 18 ++-- .../dataset_assessment_manager.py | 20 +++-- apt/risk/data_assessment/dataset_attack.py | 56 ++++++------ ...set_attack_membership_knn_probabilities.py | 86 +++++++++---------- .../data_assessment/dataset_attack_result.py | 16 ++++ ...taset_attack_whole_dataset_knn_distance.py | 54 ++++++------ apt/utils/models/keras_model.py | 9 +- apt/utils/models/pytorch_model.py | 68 +++++++++------ apt/utils/models/xgboost_model.py | 3 +- docs/conf.py | 4 +- docs/index.rst | 2 + docs/source/apt.anonymization.rst | 1 - docs/source/apt.minimization.rst | 1 - docs/source/apt.risk.data_assessment.rst | 61 +++++++++++++ docs/source/apt.risk.rst | 18 ++++ docs/source/apt.rst | 1 + docs/source/apt.utils.datasets.rst | 1 - docs/source/apt.utils.models.rst | 23 +++++ docs/source/apt.utils.rst | 1 - requirements.txt | 3 + setup.cfg | 2 +- tests/test_minimizer.py | 3 + tests/test_model.py | 4 +- 25 files changed, 306 insertions(+), 152 deletions(-) create mode 100644 apt/risk/__init__.py create mode 100644 docs/source/apt.risk.data_assessment.rst create mode 100644 docs/source/apt.risk.rst diff --git a/apt/__init__.py b/apt/__init__.py index ae1d0d0..7aba4bf 100644 --- a/apt/__init__.py +++ b/apt/__init__.py @@ -6,4 +6,4 @@ from apt import anonymization from apt import minimization from apt import utils -__version__ = "0.1.0" +__version__ = "0.2.0" diff --git a/apt/risk/__init__.py b/apt/risk/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/apt/risk/__init__.py @@ -0,0 +1 @@ + 
diff --git a/apt/risk/data_assessment/attack_strategy_utils.py b/apt/risk/data_assessment/attack_strategy_utils.py index 674feff..b0be8a1 100644 --- a/apt/risk/data_assessment/attack_strategy_utils.py +++ b/apt/risk/data_assessment/attack_strategy_utils.py @@ -9,21 +9,20 @@ from apt.utils.datasets import ArrayDataset class AttackStrategyUtils(abc.ABC): """ - Abstract base class for common utilities of various privacy attack strategies. + Abstract base class for common utilities of various privacy attack strategies. """ pass class KNNAttackStrategyUtils(AttackStrategyUtils): """ - Common utilities for attack strategy based on KNN distances. + Common utilities for attack strategy based on KNN distances. + + :param use_batches: Use batches with a progress meter or not when finding KNNs for query set. + :param batch_size: if use_batches=True, the size of batch_size should be > 0. """ def __init__(self, use_batches: bool = False, batch_size: int = 10) -> None: - """ - :param use_batches: Use batches with a progress meter or not when finding KNNs for query set - :param batch_size: if use_batches=True, the size of batch_size should be > 0 - """ self.use_batches = use_batches self.batch_size = batch_size if use_batches: @@ -31,11 +30,18 @@ class KNNAttackStrategyUtils(AttackStrategyUtils): raise ValueError(f"When using batching batch_size should be > 0, and not {batch_size}") def fit(self, knn_learner: NearestNeighbors, dataset: ArrayDataset): + """ + Fit the KNN learner. + + :param knn_learner: The KNN model to fit. + :param dataset: The training set to fit the model on. + """ knn_learner.fit(dataset.get_samples()) def find_knn(self, knn_learner: NearestNeighbors, query_samples: ArrayDataset, distance_processor=None): """ Nearest neighbor search function. 
+ :param query_samples: query samples, to which nearest neighbors are to be found :param knn_learner: unsupervised learner for implementing neighbor searches, after it was fitted :param distance_processor: function for processing the distance into another more relevant metric per sample. diff --git a/apt/risk/data_assessment/dataset_assessment_manager.py b/apt/risk/data_assessment/dataset_assessment_manager.py index 78beeef..a94fe70 100644 --- a/apt/risk/data_assessment/dataset_assessment_manager.py +++ b/apt/risk/data_assessment/dataset_assessment_manager.py @@ -15,6 +15,12 @@ from apt.utils.datasets import ArrayDataset @dataclass class DatasetAssessmentManagerConfig: + """ + Configuration for DatasetAssessmentManager. + + :param persist_reports: Whether to save assessment results to filesystem. + :param generate_plots: Whether to generate and visualize plots as part of assessment. + """ persist_reports: bool = False generate_plots: bool = False @@ -22,14 +28,13 @@ class DatasetAssessmentManagerConfig: class DatasetAssessmentManager: """ The main class for running dataset assessment attacks. + + :param config: Configuration parameters to guide the dataset assessment process """ attack_scores_per_record_knn_probabilities: list[DatasetAttackScore] = [] attack_scores_whole_dataset_knn_distance: list[DatasetAttackScore] = [] def __init__(self, config: Optional[DatasetAssessmentManagerConfig] = DatasetAssessmentManagerConfig) -> None: - """ - :param config: Configuration parameters to guide the dataset assessment process - """ self.config = config def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, @@ -67,14 +72,17 @@ class DatasetAssessmentManager: return [score_gl, score_h] def dump_all_scores_to_files(self): + """ + Save assessment results to filesystem. 
+ """ if self.config.persist_reports: results_log_file = "_results.log.csv" - self.dump_scores_to_file(self.attack_scores_per_record_knn_probabilities, + self._dump_scores_to_file(self.attack_scores_per_record_knn_probabilities, "per_record_knn_probabilities" + results_log_file, True) - self.dump_scores_to_file(self.attack_scores_whole_dataset_knn_distance, + self._dump_scores_to_file(self.attack_scores_whole_dataset_knn_distance, "whole_dataset_knn_distance" + results_log_file, True) @staticmethod - def dump_scores_to_file(attack_scores: list[DatasetAttackScore], filename: str, header: bool): + def _dump_scores_to_file(attack_scores: list[DatasetAttackScore], filename: str, header: bool): run_results_df = pd.DataFrame(attack_scores).drop('result', axis=1, errors='ignore') # don't serialize result run_results_df.to_csv(filename, header=header, encoding='utf-8', index=False, mode='w') # Overwrite diff --git a/apt/risk/data_assessment/dataset_attack.py b/apt/risk/data_assessment/dataset_attack.py index e057c8a..76b6330 100644 --- a/apt/risk/data_assessment/dataset_attack.py +++ b/apt/risk/data_assessment/dataset_attack.py @@ -16,32 +16,30 @@ from apt.utils.datasets import ArrayDataset class Config(abc.ABC): """ - The base class for dataset attack configurations + The base class for dataset attack configurations """ pass class DatasetAttack(abc.ABC): """ - The interface for performing privacy attack for risk assessment of synthetic datasets to be used in AI model - training. The original data members (training data) and non-members (the holdout data) should be available. - For reliability, all the datasets should be preprocessed and normalized. + The interface for performing privacy attack for risk assessment of synthetic datasets to be used in AI model + training. The original data members (training data) and non-members (the holdout data) should be available. + For reliability, all the datasets should be preprocessed and normalized. 
+ + :param original_data_members: A container for the training original samples and labels, + only samples are used in the assessment + :param original_data_non_members: A container for the holdout original samples and labels, + only samples are used in the assessment + :param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment + :param config: Configuration parameters to guide the assessment process + :param dataset_name: A name to identify the dataset under attack, optional + :param attack_strategy_utils: Utils for use with the attack strategy, optional """ def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, synthetic_data: ArrayDataset, config: Config, dataset_name: str, attack_strategy_utils: Optional[AttackStrategyUtils] = None) -> None: - """ - :param original_data_members: A container for the training original samples and labels, - only samples are used in the assessment - :param original_data_non_members: A container for the holdout original samples and labels, - only samples are used in the assessment - :param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment - :param config: Configuration parameters to guide the assessment process - :param dataset_name: A name to identify the dataset under attack, optional - :param attack_strategy_utils: Utils for use with the attack strategy, optional - """ - self.original_data_members = original_data_members self.original_data_non_members = original_data_non_members self.synthetic_data = synthetic_data @@ -52,7 +50,8 @@ class DatasetAttack(abc.ABC): @abc.abstractmethod def assess_privacy(self) -> DatasetAttackScore: """ - Assess the privacy of the dataset + Assess the privacy of the dataset. 
+ :return: score: DatasetAttackScore the privacy attack risk score """ @@ -61,14 +60,15 @@ class DatasetAttack(abc.ABC): class DatasetAttackMembership(DatasetAttack): """ - An abstract base class for performing privacy risk assessment for synthetic datasets on a per-record level. + An abstract base class for performing privacy risk assessment for synthetic datasets on a per-record level. """ @abc.abstractmethod def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultMembership, generate_plot: bool = False) -> DatasetAttackScore: """ - Calculate dataset privacy score based on the result of the privacy attack + Calculate dataset privacy score based on the result of the privacy attack. + :return: score: DatasetAttackScore """ @@ -78,15 +78,16 @@ class DatasetAttackMembership(DatasetAttack): def plot_roc_curve(dataset_name: str, member_probabilities: np.ndarray, non_member_probabilities: np.ndarray, filename_prefix: str = ""): """ - Plot ROC curve - :param dataset_name: dataset name, will become part of the plot filename - :param member_probabilities: probability estimates of the member samples, the training data - :param non_member_probabilities: probability estimates of the non-member samples, the hold-out data - :param filename_prefix: name prefix for the ROC curve plot + Plot ROC curve. + + :param dataset_name: dataset name, will become part of the plot filename. + :param member_probabilities: probability estimates of the member samples, the training data. + :param non_member_probabilities: probability estimates of the non-member samples, the hold-out data. + :param filename_prefix: name prefix for the ROC curve plot. 
""" labels = np.concatenate((np.zeros((len(non_member_probabilities),)), np.ones((len(member_probabilities),)))) results = np.concatenate((non_member_probabilities, member_probabilities)) - svc_disp = RocCurveDisplay.from_predictions(labels, results) + RocCurveDisplay.from_predictions(labels, results) plt.plot([0, 1], [0, 1], color="navy", linewidth=2, linestyle="--", label='No skills') plt.title('ROC curve') plt.savefig(f'{filename_prefix}{dataset_name}_roc_curve.png') @@ -94,9 +95,10 @@ class DatasetAttackMembership(DatasetAttack): @staticmethod def calculate_metrics(member_probabilities: np.ndarray, non_member_probabilities: np.ndarray): """ - Calculate attack performance metrics - :param member_probabilities: probability estimates of the member samples, the training data - :param non_member_probabilities: probability estimates of the non-member samples, the hold-out data + Calculate attack performance metrics. + + :param member_probabilities: probability estimates of the member samples, the training data. + :param non_member_probabilities: probability estimates of the non-member samples, the hold-out data. :return: fpr: False Positive rate tpr: True Positive rate diff --git a/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py b/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py index 7779b17..374ff18 100644 --- a/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py +++ b/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py @@ -19,18 +19,18 @@ from apt.utils.datasets import ArrayDataset @dataclass class DatasetAttackConfigMembershipKnnProbabilities(Config): - """Configuration for DatasetAttackMembershipKnnProbabilities. + """ + Configuration for DatasetAttackMembershipKnnProbabilities. - Attributes: - k: Number of nearest neighbors to search - use_batches: Divide query samples into batches or not. - batch_size: Query sample batch size. 
- compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must return - one value indicating the distance between those vectors. - See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation. - distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in - sklearn.neighbors.NearestNeighbors documentation. - generate_plot: Generate or not an AUR ROC curve and persist it in a file + :param k: Number of nearest neighbors to search. + :param use_batches: Divide query samples into batches or not. + :param batch_size: Query sample batch size. + :param compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must + return one value indicating the distance between those vectors. + See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation. + :param distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in + sklearn.neighbors.NearestNeighbors documentation. + :param generate_plot: Generate or not an AUC ROC curve and persist it in a file. """ k: int = 5 use_batches: bool = False @@ -42,7 +42,14 @@ class DatasetAttackConfigMembershipKnnProbabilities(Config): @dataclass class DatasetAttackScoreMembershipKnnProbabilities(DatasetAttackScore): - """DatasetAttackMembershipKnnProbabilities privacy risk score. + """ + DatasetAttackMembershipKnnProbabilities privacy risk score. + + :param dataset_name: dataset name to be used in reports + :param roc_auc_score: the area under the receiver operating characteristic curve (AUC ROC) to evaluate the + attack performance. + :param average_precision_score: the proportion of predicted members that are correctly members. + :param result: the result of the membership inference attack. 
""" roc_auc_score: float average_precision_score: float @@ -50,13 +57,6 @@ class DatasetAttackScoreMembershipKnnProbabilities(DatasetAttackScore): def __init__(self, dataset_name: str, roc_auc_score: float, average_precision_score: float, result: DatasetAttackResultMembership) -> None: - """ - dataset_name: dataset name to be used in reports - roc_auc_score: the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack - performance. - average_precision_score: the proportion of predicted members that are correctly members - result: the result of the membership inference attack - """ super().__init__(dataset_name=dataset_name, risk_score=roc_auc_score, result=result) self.roc_auc_score = roc_auc_score self.average_precision_score = average_precision_score @@ -64,24 +64,23 @@ class DatasetAttackScoreMembershipKnnProbabilities(DatasetAttackScore): class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership): """ - Privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of - members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset. - By default, the Euclidean distance is used (L2 norm), but another ``compute_distance()`` method can be provided - in configuration instead. - The area under the receiver operating characteristic curve (AUC ROC) gives the privacy risk measure. + Privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of + members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset. + By default, the Euclidean distance is used (L2 norm), but another ``compute_distance()`` method can be provided + in configuration instead. + The area under the receiver operating characteristic curve (AUC ROC) gives the privacy risk measure. 
+ + :param original_data_members: A container for the training original samples and labels + :param original_data_non_members: A container for the holdout original samples and labels + :param synthetic_data: A container for the synthetic samples and labels + :param config: Configuration parameters to guide the attack, optional + :param dataset_name: A name to identify this dataset, optional """ def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, synthetic_data: ArrayDataset, config: DatasetAttackConfigMembershipKnnProbabilities = DatasetAttackConfigMembershipKnnProbabilities(), dataset_name: str = DEFAULT_DATASET_NAME): - """ - :param original_data_members: A container for the training original samples and labels - :param original_data_non_members: A container for the holdout original samples and labels - :param synthetic_data: A container for the synthetic samples and labels - :param config: Configuration parameters to guide the attack, optional - :param dataset_name: A name to identify this dataset, optional - """ attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size) super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name, attack_strategy_utils) @@ -103,10 +102,9 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership): by the Parzen window density estimation in ``probability_per_sample()``, computed from the NN distances from the query samples to the synthetic data samples. 
- :return: - Privacy score of the attack together with the attack result with the probabilities of member and - non-member samples to be generated by the synthetic data generator based on the NN distances from the - query samples to the synthetic data samples + :return: Privacy score of the attack together with the attack result with the probabilities of member and + non-member samples to be generated by the synthetic data generator based on the NN distances from the + query samples to the synthetic data samples """ # nearest neighbor search self.attack_strategy_utils.fit(self.knn_learner, self.synthetic_data) @@ -130,11 +128,11 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership): """ Evaluate privacy score from the probabilities of member and non-member samples to be generated by the synthetic data generator. The probabilities are computed by the ``assess_privacy()`` method. - :param dataset_attack_result attack result containing probabilities of member and non-member samples to be - generated by the synthetic data generator - :param generate_plot generate AUC ROC curve plot and persist it - :return: - score of the attack, based on distance-based probabilities - mainly the ROC AUC score + + :param dataset_attack_result: attack result containing probabilities of member and non-member samples to be + generated by the synthetic data generator. + :param generate_plot: generate AUC ROC curve plot and persist it. + :return: score of the attack, based on distance-based probabilities - mainly the ROC AUC score. """ member_proba, non_member_proba = \ dataset_attack_result.member_probabilities, dataset_attack_result.non_member_probabilities @@ -151,10 +149,10 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership): """ For every sample represented by its distance from the query sample to its KNN in synthetic data, computes the probability of the synthetic data to be part of the query dataset. 
+ :param distances: distance between every query sample in batch to its KNNs among synthetic samples, a numpy - array of size (n, k) with n being the number of samples, k - the number of KNNs - :return: - probability estimates of the query samples being generated and so - of being part of the synthetic set, a - numpy array of size (n,) + array of size (n, k) with n being the number of samples, k - the number of KNNs. + :return: probability estimates of the query samples being generated and so - of being part of the synthetic set, + a numpy array of size (n,) """ return np.average(np.exp(-distances), axis=1) diff --git a/apt/risk/data_assessment/dataset_attack_result.py b/apt/risk/data_assessment/dataset_attack_result.py index 0ed0bd4..afd4b36 100644 --- a/apt/risk/data_assessment/dataset_attack_result.py +++ b/apt/risk/data_assessment/dataset_attack_result.py @@ -8,11 +8,21 @@ DEFAULT_DATASET_NAME = "dataset" @dataclass class DatasetAttackResult: + """ + Basic class for storing privacy risk assessment results. + """ pass @dataclass class DatasetAttackScore: + """ + Basic class for storing privacy risk assessment scores. + + :param dataset_name: The name of the dataset that was assessed. + :param risk_score: The privacy risk score. + :param result: An optional list of more detailed results. + """ dataset_name: str risk_score: float result: Optional[DatasetAttackResult] @@ -20,5 +30,11 @@ class DatasetAttackScore: @dataclass class DatasetAttackResultMembership(DatasetAttackResult): + """ + Class for storing membership attack results. + + :param member_probabilities: The attack probabilities for member samples. + :param non_member_probabilities: The attack probabilities for non-member samples. 
+ """ member_probabilities: np.ndarray non_member_probabilities: np.ndarray diff --git a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py index 1a57bbd..6dea1d5 100644 --- a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py +++ b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py @@ -20,16 +20,16 @@ K = 1 # Number of nearest neighbors to search. For DCR we need only the nearest @dataclass class DatasetAttackConfigWholeDatasetKnnDistance(Config): - """Configuration for DatasetAttackWholeDatasetKnnDistance. + """ + Configuration for DatasetAttackWholeDatasetKnnDistance. - Attributes: - use_batches: Divide query samples into batches or not. - batch_size: Query sample batch size. - compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must return - one value indicating the distance between those vectors. - See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation. - distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in - sklearn.neighbors.NearestNeighbors documentation. + :param use_batches: Divide query samples into batches or not. + :param batch_size: Query sample batch size. + :param compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must + return one value indicating the distance between those vectors. + See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation. + :param distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in + sklearn.neighbors.NearestNeighbors documentation. 
""" use_batches: bool = False batch_size: int = 10 @@ -39,41 +39,40 @@ class DatasetAttackConfigWholeDatasetKnnDistance(Config): @dataclass class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore): - """DatasetAttackWholeDatasetKnnDistance privacy risk score. + """ + DatasetAttackWholeDatasetKnnDistance privacy risk score. + + :param dataset_name: Dataset name to be used in reports. + :param share: The share of synthetic records closer to the training than the holdout dataset. + A value of 0.5 or close to it means good privacy. """ share: float assessment_type: str = 'WholeDatasetKnnDistance' # to be used in reports def __init__(self, dataset_name: str, share: float) -> None: - """ - dataset_name: dataset name to be used in reports - share : the share of synthetic records closer to the training than the holdout dataset. - A value of 0.5 or close to it means good privacy. - """ super().__init__(dataset_name=dataset_name, risk_score=share, result=None) self.share = share class DatasetAttackWholeDatasetKnnDistance(DatasetAttack): """ - Privacy risk assessment for synthetic datasets based on distances of synthetic data records from - members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic - records closer to the training than the holdout dataset. - By default, the Euclidean distance is used (L2 norm), but another compute_distance() method can be provided in - configuration instead. + Privacy risk assessment for synthetic datasets based on distances of synthetic data records from + members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic + records closer to the training than the holdout dataset. + By default, the Euclidean distance is used (L2 norm), but another compute_distance() method can be provided in + configuration instead. + + :param original_data_members: A container for the training original samples and labels. 
+ :param original_data_non_members: A container for the holdout original samples and labels. + :param synthetic_data: A container for the synthetic samples and labels. + :param config: Configuration parameters to guide the assessment process, optional. + :param dataset_name: A name to identify this dataset, optional. """ def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, synthetic_data: ArrayDataset, config: DatasetAttackConfigWholeDatasetKnnDistance = DatasetAttackConfigWholeDatasetKnnDistance(), dataset_name: str = DEFAULT_DATASET_NAME): - """ - :param original_data_members: A container for the training original samples and labels - :param original_data_non_members: A container for the holdout original samples and labels - :param synthetic_data: A container for the synthetic samples and labels - :param config: Configuration parameters to guide the assessment process, optional - :param dataset_name: A name to identify this dataset, optional - """ attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size) super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name, attack_strategy_utils) @@ -90,6 +89,7 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack): """ Calculate the share of synthetic records closer to the training than the holdout dataset, based on the DCR computed by 'calculate_distances()'. 
+ :return: score of the attack, based on the NN distances from the query samples to the synthetic data samples """ diff --git a/apt/utils/models/keras_model.py b/apt/utils/models/keras_model.py index 4f64ed7..6f89a35 100644 --- a/apt/utils/models/keras_model.py +++ b/apt/utils/models/keras_model.py @@ -2,9 +2,6 @@ from typing import Optional import numpy as np -import tensorflow as tf -from tensorflow import keras - from sklearn.metrics import mean_squared_error from apt.utils.models import Model, ModelOutputType, ScoringMethod, check_correct_model_output @@ -14,8 +11,6 @@ from art.utils import check_and_transform_label_format from art.estimators.classification.keras import KerasClassifier as ArtKerasClassifier from art.estimators.regression.keras import KerasRegressor as ArtKerasRegressor -tf.compat.v1.disable_eager_execution() - class KerasModel(Model): """ @@ -41,7 +36,7 @@ class KerasClassifier(KerasModel): queries that can be submitted. Default is True. :type unlimited_queries: boolean, optional """ - def __init__(self, model: keras.models.Model, output_type: ModelOutputType, black_box_access: Optional[bool] = True, + def __init__(self, model: "keras.models.Model", output_type: ModelOutputType, black_box_access: Optional[bool] = True, unlimited_queries: Optional[bool] = True, **kwargs): super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs) logits = False @@ -107,7 +102,7 @@ class KerasRegressor(KerasModel): queries that can be submitted. Default is True. 
:type unlimited_queries: boolean, optional """ - def __init__(self, model: keras.models.Model, black_box_access: Optional[bool] = True, + def __init__(self, model: "keras.models.Model", black_box_access: Optional[bool] = True, unlimited_queries: Optional[bool] = True, **kwargs): super().__init__(model, ModelOutputType.REGRESSOR_SCALAR, black_box_access, unlimited_queries, **kwargs) self._art_model = ArtKerasRegressor(model) diff --git a/apt/utils/models/pytorch_model.py b/apt/utils/models/pytorch_model.py index 3e8b550..a97fd33 100644 --- a/apt/utils/models/pytorch_model.py +++ b/apt/utils/models/pytorch_model.py @@ -31,7 +31,9 @@ class PyTorchClassifierWrapper(ArtPyTorchClassifier): """ def get_step_correct(self, outputs, targets) -> int: - """get number of correctly classified labels""" + """ + Get number of correctly classified labels. + """ if len(outputs) != len(targets): raise ValueError("outputs and targets should be the same length.") if self.nb_classes > 1: @@ -40,7 +42,9 @@ class PyTorchClassifierWrapper(ArtPyTorchClassifier): return int(torch.sum(torch.round(outputs, axis=-1) == targets).item()) def _eval(self, loader: DataLoader): - """inner function for model evaluation""" + """ + Inner function for model evaluation. + """ self.model.eval() total_loss = 0 @@ -74,19 +78,20 @@ class PyTorchClassifierWrapper(ArtPyTorchClassifier): ) -> None: """ Fit the classifier on the training set `(x, y)`. + :param x: Training data. :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or index labels - of shape (nb_samples,). + of shape (nb_samples,). :param x_validation: Validation data (optional). :param y_validation: Target validation values (class labels) one-hot-encoded of shape - (nb_samples, nb_classes) or index labels of shape (nb_samples,) (optional). + (nb_samples, nb_classes) or index labels of shape (nb_samples,) (optional). :param batch_size: Size of batches. :param nb_epochs: Number of epochs to use for training. 
:param save_checkpoints: Boolean, save checkpoints if True. :param save_entire_model: Boolean, save entire model if True, else save state dict. :param path: path for saving checkpoint. :param kwargs: Dictionary of framework-specific arguments. This parameter is not currently - supported for PyTorch and providing it takes no effect. + supported for PyTorch and providing it takes no effect. """ # Put the model in the training mode self._model.train() @@ -153,7 +158,8 @@ class PyTorchClassifierWrapper(ArtPyTorchClassifier): def save_checkpoint_state_dict(self, is_best: bool, path=os.getcwd(), filename="latest.tar") -> None: """ - Saves checkpoint as latest.tar or best.tar + Saves checkpoint as latest.tar or best.tar. + :param is_best: whether the model is the best achieved model :param path: path for saving checkpoint :param filename: checkpoint name @@ -176,7 +182,8 @@ class PyTorchClassifierWrapper(ArtPyTorchClassifier): def save_checkpoint_model(self, is_best: bool, path=os.getcwd(), filename="latest.tar") -> None: """ - Saves checkpoint as latest.tar or best.tar + Saves checkpoint as latest.tar or best.tar. + :param is_best: whether the model is the best achieved model :param path: path for saving checkpoint :param filename: checkpoint name @@ -194,7 +201,8 @@ class PyTorchClassifierWrapper(ArtPyTorchClassifier): def load_checkpoint_state_dict_by_path(self, model_name: str, path: str = None): """ - Load model only based on the check point path + Load model only based on the check point path. + :param model_name: check point filename :param path: checkpoint path (default current work dir) :return: loaded model @@ -219,21 +227,24 @@ class PyTorchClassifierWrapper(ArtPyTorchClassifier): def load_latest_state_dict_checkpoint(self): """ - Load model state dict only based on the check point path (latest.tar) + Load model state dict only based on the check point path (latest.tar). 
+ :return: loaded model """ self.load_checkpoint_state_dict_by_path("latest.tar") def load_best_state_dict_checkpoint(self): """ - Load model state dict only based on the check point path (model_best.tar) + Load model state dict only based on the check point path (model_best.tar). + :return: loaded model """ self.load_checkpoint_state_dict_by_path("model_best.tar") def load_checkpoint_model_by_path(self, model_name: str, path: str = None): """ - Load model only based on the check point path + Load model only based on the check point path. + :param model_name: check point filename :param path: checkpoint path (default current work dir) :return: loaded model @@ -254,14 +265,16 @@ class PyTorchClassifierWrapper(ArtPyTorchClassifier): def load_latest_model_checkpoint(self): """ - Load entire model only based on the check point path (latest.tar) + Load entire model only based on the check point path (latest.tar). + :return: loaded model """ self.load_checkpoint_model_by_path("latest.tar") def load_best_model_checkpoint(self): """ - Load entire model only based on the check point path (model_best.tar) + Load entire model only based on the check point path (model_best.tar). + :return: loaded model """ self.load_checkpoint_model_by_path("model_best.tar") @@ -288,11 +301,11 @@ class PyTorchClassifier(PyTorchModel): Initialization specifically for the PyTorch-based implementation. :param model: PyTorch model. The output of the model can be logits, probabilities or anything else. Logits - output should be preferred where possible to ensure attack efficiency. + output should be preferred where possible to ensure attack efficiency. :param output_type: The type of output the model yields (vector/label only for classifiers, value for regressors) :param loss: The loss function for which to compute gradients for training. The target label must be raw - categorical, i.e. not converted to one-hot encoding. + categorical, i.e. not converted to one-hot encoding. 
:param input_shape: The shape of one input instance. :param optimizer: The optimizer used to train the classifier. :param black_box_access: Boolean describing the type of deployment of the model (when in production). @@ -311,7 +324,7 @@ class PyTorchClassifier(PyTorchModel): @property def loss(self): """ - The pytorch model's loss function + The pytorch model's loss function. :return: The pytorch model's loss function """ @@ -320,7 +333,7 @@ class PyTorchClassifier(PyTorchModel): @property def optimizer(self): """ - The pytorch model's optimizer + The pytorch model's optimizer. :return: The pytorch model's optimizer """ @@ -350,7 +363,7 @@ class PyTorchClassifier(PyTorchModel): :param save_entire_model: Boolean, save entire model if True, else save state dict. :param path: path for saving checkpoint. :param kwargs: Dictionary of framework-specific arguments. This parameter is not currently - supported for PyTorch and providing it takes no effect. + supported for PyTorch and providing it takes no effect. """ if validation_data is None: self._art_model.fit( @@ -390,6 +403,7 @@ class PyTorchClassifier(PyTorchModel): def score(self, test_data: PytorchData, **kwargs): """ Score the model using test data. + :param test_data: Test data. :type test_data: `PytorchData` :return: the score as float (between 0 and 1) @@ -400,7 +414,8 @@ class PyTorchClassifier(PyTorchModel): def load_checkpoint_state_dict_by_path(self, model_name: str, path: str = None): """ - Load model only based on the check point path + Load model only based on the check point path. + :param model_name: check point filename :param path: checkpoint path (default current work dir) :return: loaded model @@ -409,21 +424,24 @@ class PyTorchClassifier(PyTorchModel): def load_latest_state_dict_checkpoint(self): """ - Load model state dict only based on the check point path (latest.tar) + Load model state dict only based on the check point path (latest.tar). 
+ :return: loaded model """ self._art_model.load_latest_state_dict_checkpoint() def load_best_state_dict_checkpoint(self): """ - Load model state dict only based on the check point path (model_best.tar) + Load model state dict only based on the check point path (model_best.tar). + :return: loaded model """ self._art_model.load_best_state_dict_checkpoint() def load_checkpoint_model_by_path(self, model_name: str, path: str = None): """ - Load model only based on the check point path + Load model only based on the check point path. + :param model_name: check point filename :param path: checkpoint path (default current work dir) :return: loaded model @@ -432,14 +450,16 @@ class PyTorchClassifier(PyTorchModel): def load_latest_model_checkpoint(self): """ - Load entire model only based on the check point path (latest.tar) + Load entire model only based on the check point path (latest.tar). + :return: loaded model """ self._art_model.load_latest_model_checkpoint() def load_best_model_checkpoint(self): """ - Load entire model only based on the check point path (model_best.tar) + Load entire model only based on the check point path (model_best.tar). + :return: loaded model """ self._art_model.load_best_model_checkpoint() diff --git a/apt/utils/models/xgboost_model.py b/apt/utils/models/xgboost_model.py index 2fdc9fe..85f9a89 100644 --- a/apt/utils/models/xgboost_model.py +++ b/apt/utils/models/xgboost_model.py @@ -3,7 +3,6 @@ from typing import Optional, Tuple from apt.utils.models import Model, ModelOutputType, ScoringMethod, check_correct_model_output, is_one_hot from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE -from xgboost import XGBClassifier import numpy as np from art.estimators.classification.xgboost import XGBoostClassifier as ArtXGBoostClassifier @@ -37,7 +36,7 @@ class XGBoostClassifier(XGBoostModel): queries that can be submitted. Default is True. 
:type unlimited_queries: boolean, optional """ - def __init__(self, model: XGBClassifier, output_type: ModelOutputType, input_shape: Tuple[int, ...], + def __init__(self, model: "xgboost.XGBClassifier", output_type: ModelOutputType, input_shape: Tuple[int, ...], nb_classes: int, black_box_access: Optional[bool] = True, unlimited_queries: Optional[bool] = True, **kwargs): super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs) diff --git a/docs/conf.py b/docs/conf.py index aa505c2..d0da43d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,7 +22,7 @@ copyright = '2021, IBM' author = 'Abigail Goldsteen' # The full version, including alpha/beta/rc tags -release = '0.1.0' +release = '0.2.0' master_doc = 'index' @@ -53,7 +53,7 @@ exclude_patterns = [] # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'pyramid' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/docs/index.rst b/docs/index.rst index 6a1969d..0d26e63 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -18,6 +18,8 @@ minimization principle in GDPR for ML models. It enables to reduce the amount of personal data needed to perform predictions with a machine learning model, while still enabling the model to make accurate predictions. This is done by by removing or generalizing some of the input features. +The dataset risk assessment module implements a tool for privacy assessment of synthetic datasets that are to be used in AI model training. + .. 
toctree:: :maxdepth: 2 :caption: Getting Started: diff --git a/docs/source/apt.anonymization.rst b/docs/source/apt.anonymization.rst index 6453554..f0aea69 100644 --- a/docs/source/apt.anonymization.rst +++ b/docs/source/apt.anonymization.rst @@ -12,7 +12,6 @@ apt.anonymization.anonymizer module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/apt.minimization.rst b/docs/source/apt.minimization.rst index 417fc68..a84364e 100644 --- a/docs/source/apt.minimization.rst +++ b/docs/source/apt.minimization.rst @@ -12,7 +12,6 @@ apt.minimization.minimizer module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/apt.risk.data_assessment.rst b/docs/source/apt.risk.data_assessment.rst new file mode 100644 index 0000000..88c345c --- /dev/null +++ b/docs/source/apt.risk.data_assessment.rst @@ -0,0 +1,61 @@ +apt.risk.data\_assessment package +================================= + +Submodules +---------- + +apt.risk.data\_assessment.attack\_strategy\_utils module +-------------------------------------------------------- + +.. automodule:: apt.risk.data_assessment.attack_strategy_utils + :members: + :undoc-members: + :show-inheritance: + +apt.risk.data\_assessment.dataset\_assessment\_manager module +------------------------------------------------------------- + +.. automodule:: apt.risk.data_assessment.dataset_assessment_manager + :members: + :undoc-members: + :show-inheritance: + +apt.risk.data\_assessment.dataset\_attack module +------------------------------------------------ + +.. automodule:: apt.risk.data_assessment.dataset_attack + :members: + :undoc-members: + :show-inheritance: + +apt.risk.data\_assessment.dataset\_attack\_membership\_knn\_probabilities module +-------------------------------------------------------------------------------- + +.. 
automodule:: apt.risk.data_assessment.dataset_attack_membership_knn_probabilities + :members: + :undoc-members: + :show-inheritance: + +apt.risk.data\_assessment.dataset\_attack\_result module +-------------------------------------------------------- + +.. automodule:: apt.risk.data_assessment.dataset_attack_result + :members: + :undoc-members: + :show-inheritance: + +apt.risk.data\_assessment.dataset\_attack\_whole\_dataset\_knn\_distance module +------------------------------------------------------------------------------- + +.. automodule:: apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: apt.risk.data_assessment + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apt.risk.rst b/docs/source/apt.risk.rst new file mode 100644 index 0000000..565b3ed --- /dev/null +++ b/docs/source/apt.risk.rst @@ -0,0 +1,18 @@ +apt.risk package +================ + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + apt.risk.data_assessment + +Module contents +--------------- + +.. 
automodule:: apt.risk + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apt.rst b/docs/source/apt.rst index ebbf65f..b77eada 100644 --- a/docs/source/apt.rst +++ b/docs/source/apt.rst @@ -9,6 +9,7 @@ Subpackages apt.anonymization apt.minimization + apt.risk apt.utils Module contents diff --git a/docs/source/apt.utils.datasets.rst b/docs/source/apt.utils.datasets.rst index f40bbdf..b35ee0d 100644 --- a/docs/source/apt.utils.datasets.rst +++ b/docs/source/apt.utils.datasets.rst @@ -12,7 +12,6 @@ apt.utils.datasets.datasets module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/apt.utils.models.rst b/docs/source/apt.utils.models.rst index de4a5b1..3caa93c 100644 --- a/docs/source/apt.utils.models.rst +++ b/docs/source/apt.utils.models.rst @@ -4,6 +4,14 @@ apt.utils.models package Submodules ---------- +apt.utils.models.keras\_model module +------------------------------------ + +.. automodule:: apt.utils.models.keras_model + :members: + :undoc-members: + :show-inheritance: + apt.utils.models.model module ----------------------------- @@ -12,6 +20,14 @@ apt.utils.models.model module :undoc-members: :show-inheritance: +apt.utils.models.pytorch\_model module +-------------------------------------- + +.. automodule:: apt.utils.models.pytorch_model + :members: + :undoc-members: + :show-inheritance: + apt.utils.models.sklearn\_model module -------------------------------------- @@ -20,6 +36,13 @@ apt.utils.models.sklearn\_model module :undoc-members: :show-inheritance: +apt.utils.models.xgboost\_model module +-------------------------------------- + +.. 
automodule:: apt.utils.models.xgboost_model + :members: + :undoc-members: + :show-inheritance: Module contents --------------- diff --git a/docs/source/apt.utils.rst b/docs/source/apt.utils.rst index 4a6ce11..4ae24d2 100644 --- a/docs/source/apt.utils.rst +++ b/docs/source/apt.utils.rst @@ -21,7 +21,6 @@ apt.utils.dataset\_utils module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/requirements.txt b/requirements.txt index 4110441..2421067 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,6 @@ sortedcontainers==2.4.0 notebook jupyter ipywidgets + +#doc +sphinx_rtd_theme diff --git a/setup.cfg b/setup.cfg index 6820c91..77e9de1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,7 @@ [metadata] # replace with your username: name = ai-privacy-toolkit -version = 0.1.0 +version = 0.2.0 author = Abigail Goldsteen author_email = abigailt@il.ibm.com description = A toolkit for tools and techniques related to the privacy and compliance of AI models. 
diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py index f3f7fa7..bd2f422 100644 --- a/tests/test_minimizer.py +++ b/tests/test_minimizer.py @@ -10,6 +10,7 @@ from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder +import tensorflow as tf from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense, Input @@ -19,6 +20,8 @@ from apt.utils.dataset_utils import get_iris_dataset_np, get_adult_dataset_pd, g from apt.utils.datasets import ArrayDataset from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegressor, KerasClassifier +tf.compat.v1.disable_eager_execution() + @pytest.fixture def data(): diff --git a/tests/test_model.py b/tests/test_model.py index 8f7ee0d..b8fb8f1 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -10,14 +10,16 @@ from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestClassifier from xgboost import XGBClassifier +import tensorflow as tf from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense, Input from art.utils import check_and_transform_label_format - from art.utils import to_categorical +tf.compat.v1.disable_eager_execution() + def test_sklearn_classifier(): (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset_np()