From c0bc2c6983a106f3f15bdffc0a6fe81c3fd9b50f Mon Sep 17 00:00:00 2001 From: abigailt Date: Thu, 23 Feb 2023 10:59:51 +0200 Subject: [PATCH 01/11] Update requirements Signed-off-by: abigailt --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index fc006d1..4a8c0a6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -numpy==1.22.0 -pandas~=1.1.0 +numpy==1.24.2 +pandas==1.1.05 scipy==1.4.1 scikit-learn>=0.22.2 torch>=1.8.0 From a4e07faf0185c25c27050451df61c4630ed8a726 Mon Sep 17 00:00:00 2001 From: Maya Anderson Date: Mon, 20 Mar 2023 14:14:09 +0200 Subject: [PATCH 02/11] Merge pull request #71 from IBM/dataset_assessment Add AI privacy Dataset assessment module with two attack implementations. Signed-off-by: Maya Anderson --- .gitignore | 7 +- README.md | 5 + apt/risk/data_assessment/README.md | 105 +++++++++++ apt/risk/data_assessment/__init__.py | 12 ++ .../data_assessment/attack_strategy_utils.py | 70 +++++++ .../dataset_assessment_manager.py | 80 ++++++++ apt/risk/data_assessment/dataset_attack.py | 113 ++++++++++++ ...set_attack_membership_knn_probabilities.py | 160 ++++++++++++++++ .../data_assessment/dataset_attack_result.py | 24 +++ ...taset_attack_whole_dataset_knn_distance.py | 127 +++++++++++++ requirements.txt | 2 + tests/test_data_assessment.py | 173 ++++++++++++++++++ tests/test_data_assessment_short_test.py | 109 +++++++++++ 13 files changed, 986 insertions(+), 1 deletion(-) create mode 100644 apt/risk/data_assessment/README.md create mode 100644 apt/risk/data_assessment/__init__.py create mode 100644 apt/risk/data_assessment/attack_strategy_utils.py create mode 100644 apt/risk/data_assessment/dataset_assessment_manager.py create mode 100644 apt/risk/data_assessment/dataset_attack.py create mode 100644 apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py create mode 100644 apt/risk/data_assessment/dataset_attack_result.py create mode 100644 
apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py create mode 100644 tests/test_data_assessment.py create mode 100644 tests/test_data_assessment_short_test.py diff --git a/.gitignore b/.gitignore index b0b6f3a..9fdd7a9 100644 --- a/.gitignore +++ b/.gitignore @@ -51,6 +51,10 @@ coverage.xml .pytest_cache/ cover/ +# Test results +*.csv +*.png + # Translations *.mo *.pot @@ -157,4 +161,5 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -.idea/ \ No newline at end of file +.idea/ + diff --git a/README.md b/README.md index 7f5d498..43487f6 100644 --- a/README.md +++ b/README.md @@ -38,3 +38,8 @@ A Python library for Machine Learning Security. Includes an attack module called (membership inference, attribute inference, model inversion and database reconstruction) as well as a *privacy* metrics module that contains membership leakage metrics for ML models. + +Citation +-------- +Abigail Goldsteen, Ola Saadi, Ron Shmelkin, Shlomit Shachor, Natalia Razinkov, +"AI privacy toolkit", SoftwareX, Volume 22, 2023, 101352, ISSN 2352-7110, https://doi.org/10.1016/j.softx.2023.101352. \ No newline at end of file diff --git a/apt/risk/data_assessment/README.md b/apt/risk/data_assessment/README.md new file mode 100644 index 0000000..3943be7 --- /dev/null +++ b/apt/risk/data_assessment/README.md @@ -0,0 +1,105 @@ +# Privacy Assessment of Datasets for AI Models + +This module implements a tool for privacy assessment of synthetic datasets that are to be used in AI model training. + +The main interface, ``DatasetAttack``, with the ``assess_privacy()`` main method assumes the availability of the +training data, holdout data and synthetic data at the time of the privacy evaluation. 
+It is to be implemented by concrete assessment methods, which can run the assessment on a per-record level, +or on the whole dataset. +The method ``assess_privacy()`` returns a ``DatasetAttackScore``, which contains a ``risk_score`` and, +optionally, a ``DatasetAttackResult``. Each specific attack can implement its own ``DatasetAttackScore``, which would +contain additional fields. + +The abstract class ``DatasetAttackMembership`` implements the ``DatasetAttack`` interface, but adds the result +of the membership inference attack, so that the final score contains both the membership inference attack result +for further analysis and the calculated score. + + +``DatasetAssessmentManager`` provides convenience methods to run multiple attacks and persist the result reports. + +Attack Implementations +----------------------- + +One implementation is based on the paper "GAN-Leaks: A Taxonomy of Membership Inference Attacks against Generative +Models"[^1] and its implementation[^2]. It is a black-box membership inference attack (MIA) that uses the +distances of members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset. +By default, the Euclidean distance is used (L2 norm), but another ``compute_distance()`` method can be provided in +configuration instead. +The area under the receiver operating characteristic curve (AUC ROC) gives the privacy risk score. + +Another implementation is based on the papers "Data Synthesis based on Generative Adversarial Networks"[^3] and +"Holdout-Based Fidelity and Privacy Assessment of Mixed-Type Synthetic Data"[^4], and on a variation of its reference +implementation[^5]. +It is based on distances of synthetic data records from members (training set) and non-members (holdout set). +The privacy risk score is the share of synthetic records closer to the training dataset than to the holdout dataset. 
+By default, the Euclidean distance is used (L2 norm), but another ``compute_distance()`` method can be provided in +configuration instead. + +Usage +----- +An implementation of the ``DatasetAttack`` interface is used for performing a privacy attack for risk assessment of +synthetic datasets to be used in AI model training. +The original data members (training data), non-members (the holdout data) and the synthetic data created from the +original members should be available. +For reliability, all the datasets should be preprocessed and normalized. + +The following example runs all the attacks and persists the results in files, using ``DatasetAssessmentManager``. +It assumes that you provide it with the pairs ``(x_train, y_train)``, ``(x_test, y_test)`` and ``(x_synth, y_synth)`` +for members, non-members and the synthetic datasets, respectively. + +```python +from apt.risk.data_assessment.dataset_assessment_manager import DatasetAssessmentManager, \ + DatasetAssessmentManagerConfig +from apt.utils.datasets import ArrayDataset + +dataset_assessment_manager = DatasetAssessmentManager( + DatasetAssessmentManagerConfig(persist_reports=True, generate_plots=False)) + +synthetic_data = ArrayDataset(x_synth, y_synth) +original_data_members = ArrayDataset(x_train, y_train) +original_data_non_members = ArrayDataset(x_test, y_test) + +dataset_name = 'my_dataset' +[score_gl, score_h] = dataset_assessment_manager.assess( + original_data_members, original_data_non_members, synthetic_data, dataset_name) +dataset_assessment_manager.dump_all_scores_to_files() +``` + +Alternatively, each attack can be run separately, for instance: + +```python +from apt.risk.data_assessment.dataset_attack_membership_knn_probabilities import \ + DatasetAttackConfigMembershipKnnProbabilities, DatasetAttackMembershipKnnProbabilities +from apt.utils.datasets import ArrayDataset + +synthetic_data = ArrayDataset(x_synth, y_synth) +original_data_members = ArrayDataset(x_train, y_train) 
+original_data_non_members = ArrayDataset(x_test, y_test) + +config_gl = DatasetAttackConfigMembershipKnnProbabilities(use_batches=False, + generate_plot=False) +attack_gl = DatasetAttackMembershipKnnProbabilities(original_data_members, + original_data_non_members, + synthetic_data, + config_gl) + +score_gl = attack_gl.assess_privacy() +``` + +Citations +--------- + + [^1]: "GAN-Leaks: A Taxonomy of Membership Inference Attacks against Generative Models" by D. Chen, N. Yu, Y. Zhang, + M. Fritz in Proceedings of the 2020 ACM SIGSAC Conference on Computer and Communications Security, 343–62, 2020. + [https://doi.org/10.1145/3372297.3417238](https://doi.org/10.1145/3372297.3417238) + + [^2]: Code for the paper "GAN-Leaks: A Taxonomy of Membership Inference Attacks against Generative Models" + [https://github.com/DingfanChen/GAN-Leaks](https://github.com/DingfanChen/GAN-Leaks) + + [^3]: "Data Synthesis based on Generative Adversarial Networks." by N. Park, M. Mohammadi, K. Gorde, S. Jajodia, + H. Park, and Y. Kim in International Conference on Very Large Data Bases (VLDB), 2018. + + [^4]: "Holdout-Based Fidelity and Privacy Assessment of Mixed-Type Synthetic Data" by M. Platzer and T. Reutterer. + + [^5]: Code for the paper "Holdout-Based Fidelity and Privacy Assessment of Mixed-Type Synthetic Data" + [https://github.com/mostly-ai/paper-fidelity-accuracy](https://github.com/mostly-ai/paper-fidelity-accuracy) diff --git a/apt/risk/data_assessment/__init__.py b/apt/risk/data_assessment/__init__.py new file mode 100644 index 0000000..8731743 --- /dev/null +++ b/apt/risk/data_assessment/__init__.py @@ -0,0 +1,12 @@ +""" +Module providing privacy risk assessment for synthetic data. + +The main interface, ``DatasetAttack``, with the ``assess_privacy()`` main method assumes the availability of the +training data, holdout data and synthetic data at the time of the privacy evaluation. 
+It is to be implemented by concrete assessment methods, which can run the assessment on a per-record level, +or on the whole dataset. +The abstract class ``DatasetAttackMembership`` implements the ``DatasetAttack`` interface, but adds the result +of the membership inference attack, so that the final score contains both the membership inference attack result +for further analysis and the calculated score. +""" +from apt.risk.data_assessment import dataset_attack diff --git a/apt/risk/data_assessment/attack_strategy_utils.py b/apt/risk/data_assessment/attack_strategy_utils.py new file mode 100644 index 0000000..674feff --- /dev/null +++ b/apt/risk/data_assessment/attack_strategy_utils.py @@ -0,0 +1,70 @@ +import abc + +import numpy as np +from sklearn.neighbors import NearestNeighbors +from tqdm import tqdm + +from apt.utils.datasets import ArrayDataset + + +class AttackStrategyUtils(abc.ABC): + """ + Abstract base class for common utilities of various privacy attack strategies. + """ + pass + + +class KNNAttackStrategyUtils(AttackStrategyUtils): + """ + Common utilities for attack strategy based on KNN distances. + """ + + def __init__(self, use_batches: bool = False, batch_size: int = 10) -> None: + """ + :param use_batches: Use batches with a progress meter or not when finding KNNs for query set + :param batch_size: if use_batches=True, the size of batch_size should be > 0 + """ + self.use_batches = use_batches + self.batch_size = batch_size + if use_batches: + if batch_size < 1: + raise ValueError(f"When using batching batch_size should be > 0, and not {batch_size}") + + def fit(self, knn_learner: NearestNeighbors, dataset: ArrayDataset): + knn_learner.fit(dataset.get_samples()) + + def find_knn(self, knn_learner: NearestNeighbors, query_samples: ArrayDataset, distance_processor=None): + """ + Nearest neighbor search function. 
+ :param query_samples: query samples, to which nearest neighbors are to be found + :param knn_learner: unsupervised learner for implementing neighbor searches, after it was fitted + :param distance_processor: function for processing the distance into another more relevant metric per sample. + Its input is an array representing distances (the distances returned by NearestNeighbors.kneighbors() ), and + the output should be another array with distance-based values that enable to compute the final risk score + :return: + distances of the query samples to their nearest neighbors, or a metric based on that distance and calculated + by the distance_processor function + """ + samples = query_samples.get_samples() + if not self.use_batches: + distances, _ = knn_learner.kneighbors(samples, return_distance=True) + if distance_processor: + return distance_processor(distances) + else: + return distances + + distances = [] + for start in tqdm(range(0, len(samples), self.batch_size)): # the last batch may be smaller than batch_size + x_batch = samples[start:start + self.batch_size] + x_batch = np.reshape(x_batch, [len(x_batch), -1]) + + # dist_batch: distance between every query sample in batch to its KNNs among the fitted samples + dist_batch, _ = knn_learner.kneighbors(x_batch, return_distance=True) + + # The probability of each sample to be generated + if distance_processor: + distance_based_metric_per_sample_batch = distance_processor(dist_batch) + distances.append(distance_based_metric_per_sample_batch) + else: + distances.append(dist_batch) + return np.concatenate(distances) diff --git a/apt/risk/data_assessment/dataset_assessment_manager.py b/apt/risk/data_assessment/dataset_assessment_manager.py new file mode 100644 index 0000000..78beeef --- /dev/null +++ b/apt/risk/data_assessment/dataset_assessment_manager.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional + +import pandas as pd + +from 
apt.risk.data_assessment.dataset_attack_membership_knn_probabilities import \ + DatasetAttackConfigMembershipKnnProbabilities, DatasetAttackMembershipKnnProbabilities +from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DEFAULT_DATASET_NAME +from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import \ + DatasetAttackConfigWholeDatasetKnnDistance, DatasetAttackWholeDatasetKnnDistance +from apt.utils.datasets import ArrayDataset + + +@dataclass +class DatasetAssessmentManagerConfig: + persist_reports: bool = False + generate_plots: bool = False + + +class DatasetAssessmentManager: + """ + The main class for running dataset assessment attacks. + """ + attack_scores_per_record_knn_probabilities: list[DatasetAttackScore] = [] # shared by all instances + attack_scores_whole_dataset_knn_distance: list[DatasetAttackScore] = [] # shared by all instances + + def __init__(self, config: Optional[DatasetAssessmentManagerConfig] = None) -> None: + """ + :param config: Configuration parameters to guide the dataset assessment process, defaults are used if None + """ + self.config = config if config is not None else DatasetAssessmentManagerConfig() + + def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, + synthetic_data: ArrayDataset, dataset_name: str = DEFAULT_DATASET_NAME) -> list[DatasetAttackScore]: + """ + Do dataset privacy risk assessment by running dataset attacks, and return their scores. 
+ + :param original_data_members: A container for the training original samples and labels, + only samples are used in the assessment + :param original_data_non_members: A container for the holdout original samples and labels, + only samples are used in the assessment + :param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment + :param dataset_name: A name to identify this dataset, optional + + :return: + a list of dataset attack risk scores + """ + config_gl = DatasetAttackConfigMembershipKnnProbabilities(use_batches=False, + generate_plot=self.config.generate_plots) + attack_gl = DatasetAttackMembershipKnnProbabilities(original_data_members, + original_data_non_members, + synthetic_data, + config_gl, + dataset_name) + + score_gl = attack_gl.assess_privacy() + self.attack_scores_per_record_knn_probabilities.append(score_gl) + + config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False) + attack_h = DatasetAttackWholeDatasetKnnDistance(original_data_members, original_data_non_members, + synthetic_data, config_h, dataset_name) + + score_h = attack_h.assess_privacy() + self.attack_scores_whole_dataset_knn_distance.append(score_h) + return [score_gl, score_h] + + def dump_all_scores_to_files(self): + if self.config.persist_reports: + results_log_file = "_results.log.csv" + self.dump_scores_to_file(self.attack_scores_per_record_knn_probabilities, + "per_record_knn_probabilities" + results_log_file, True) + self.dump_scores_to_file(self.attack_scores_whole_dataset_knn_distance, + "whole_dataset_knn_distance" + results_log_file, True) + + @staticmethod + def dump_scores_to_file(attack_scores: list[DatasetAttackScore], filename: str, header: bool): + run_results_df = pd.DataFrame(attack_scores).drop('result', axis=1, errors='ignore') # don't serialize result + run_results_df.to_csv(filename, header=header, encoding='utf-8', index=False, mode='w') # Overwrite diff --git 
a/apt/risk/data_assessment/dataset_attack.py b/apt/risk/data_assessment/dataset_attack.py new file mode 100644 index 0000000..4cac42d --- /dev/null +++ b/apt/risk/data_assessment/dataset_attack.py @@ -0,0 +1,113 @@ +""" +This module defines the interface for privacy risk assessment of synthetic datasets. +""" +import abc +from typing import Optional + +import matplotlib.pyplot as plt +import numpy as np +from sklearn import metrics +from sklearn.metrics import RocCurveDisplay + +from apt.risk.data_assessment.attack_strategy_utils import AttackStrategyUtils +from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultMembership +from apt.utils.datasets import ArrayDataset + + +class Config(abc.ABC): + """ + The base class for dataset attack configurations + """ + pass + + +class DatasetAttack(abc.ABC): + """ + The interface for performing privacy attack for risk assessment of synthetic datasets to be used in AI model + training. The original data members (training data) and non-members (the holdout data) should be available. + For reliability, all the datasets should be preprocessed and normalized. 
+ """ + + def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, + synthetic_data: ArrayDataset, config: Config, dataset_name: str, + attack_strategy_utils: Optional[AttackStrategyUtils] = None) -> None: + """ + :param original_data_members: A container for the training original samples and labels, + only samples are used in the assessment + :param original_data_non_members: A container for the holdout original samples and labels, + only samples are used in the assessment + :param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment + :param config: Configuration parameters to guide the assessment process + :param dataset_name: A name to identify the dataset under attack, optional + :param attack_strategy_utils: Utils for use with the attack strategy, optional + """ + + self.original_data_members = original_data_members + self.original_data_non_members = original_data_non_members + self.synthetic_data = synthetic_data + self.config = config + self.attack_strategy_utils = attack_strategy_utils + self.dataset_name = dataset_name + + @abc.abstractmethod + def assess_privacy(self) -> DatasetAttackScore: + """ + Assess the privacy of the dataset + :return: + score: DatasetAttackScore the privacy attack risk score + """ + pass + + +class DatasetAttackMembership(DatasetAttack): + """ + An abstract base class for performing privacy risk assessment for synthetic datasets on a per-record level. 
+ """ + + @abc.abstractmethod + def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultMembership, + generate_plot: bool = False) -> DatasetAttackScore: + """ + Calculate dataset privacy score based on the result of the privacy attack + :return: + score: DatasetAttackScore + """ + pass + + @staticmethod + def plot_roc_curve(dataset_name: str, member_probabilities: np.ndarray, non_member_probabilities: np.ndarray, + filename_prefix: str = ""): + """ + Plot ROC curve and save it to the file '<filename_prefix><dataset_name>_roc_curve.png' + :param dataset_name: dataset name, will become part of the plot filename + :param member_probabilities: probability estimates of the member samples, the training data + :param non_member_probabilities: probability estimates of the non-member samples, the hold-out data + :param filename_prefix: name prefix for the ROC curve plot + """ + labels = np.concatenate((np.zeros((len(non_member_probabilities),)), np.ones((len(member_probabilities),)))) + results = np.concatenate((non_member_probabilities, member_probabilities)) + svc_disp = RocCurveDisplay.from_predictions(labels, results) # already draws the curve on a new figure + svc_disp.ax_.plot([0, 1], [0, 1], color="navy", linewidth=2, linestyle="--", label='No skills') + svc_disp.ax_.set_title('ROC curve') + svc_disp.figure_.savefig(f'{filename_prefix}{dataset_name}_roc_curve.png') + plt.close(svc_disp.figure_) # release the figure so that repeated assessments do not accumulate open figures + + @staticmethod + def calculate_metrics(member_probabilities: np.ndarray, non_member_probabilities: np.ndarray): + """ + Calculate attack performance metrics + :param member_probabilities: probability estimates of the member samples, the training data + :param non_member_probabilities: probability estimates of the non-member samples, the hold-out data + :return: + fpr: False Positive rate + tpr: True Positive rate + threshold: threshold + auc: area under the Receiver Operating Characteristic Curve + ap: average precision score + """ + labels = np.concatenate((np.zeros((len(non_member_probabilities),)), np.ones((len(member_probabilities))))) + results = np.concatenate((non_member_probabilities, 
member_probabilities)) + fpr, tpr, threshold = metrics.roc_curve(labels, results, pos_label=1) + auc = metrics.roc_auc_score(labels, results) + ap = metrics.average_precision_score(labels, results) + return fpr, tpr, threshold, auc, ap diff --git a/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py b/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py new file mode 100644 index 0000000..7779b17 --- /dev/null +++ b/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py @@ -0,0 +1,160 @@ +""" +This module implements privacy risk assessment of synthetic datasets based on the paper: +"GAN-Leaks: A Taxonomy of Membership Inference Attacks against Generative Models" by D. Chen, N. Yu, Y. Zhang, M. Fritz +published in Proceedings of the 2020 ACM SIGSAC Conference on Computer and Communications Security, 343–62, 2020. +https://doi.org/10.1145/3372297.3417238 and its implementation in https://github.com/DingfanChen/GAN-Leaks. +""" +from dataclasses import dataclass +from typing import Callable + +import numpy as np +from sklearn.neighbors import NearestNeighbors + +from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils +from apt.risk.data_assessment.dataset_attack import DatasetAttackMembership, Config +from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DatasetAttackResultMembership, \ + DEFAULT_DATASET_NAME +from apt.utils.datasets import ArrayDataset + + +@dataclass +class DatasetAttackConfigMembershipKnnProbabilities(Config): + """Configuration for DatasetAttackMembershipKnnProbabilities. + + Attributes: + k: Number of nearest neighbors to search + use_batches: Divide query samples into batches or not. + batch_size: Query sample batch size. + compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must return + one value indicating the distance between those vectors. 
+ See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation. + distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in + sklearn.neighbors.NearestNeighbors documentation. + generate_plot: Whether to generate an AUC ROC curve plot and persist it in a file + """ + k: int = 5 + use_batches: bool = False + batch_size: int = 10 + compute_distance: Callable = None + distance_params: dict = None + generate_plot: bool = False + + +@dataclass +class DatasetAttackScoreMembershipKnnProbabilities(DatasetAttackScore): + """DatasetAttackMembershipKnnProbabilities privacy risk score. + """ + roc_auc_score: float + average_precision_score: float + assessment_type: str = 'MembershipKnnProbabilities' # to be used in reports + + def __init__(self, dataset_name: str, roc_auc_score: float, average_precision_score: float, + result: DatasetAttackResultMembership) -> None: + """ + dataset_name: dataset name to be used in reports + roc_auc_score: the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack + performance. It is also used as the risk_score of this score object. + average_precision_score: the average precision (AP) score of the membership inference attack + result: the result of the membership inference attack + """ + super().__init__(dataset_name=dataset_name, risk_score=roc_auc_score, result=result) + self.roc_auc_score = roc_auc_score + self.average_precision_score = average_precision_score + + +class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership): + """ + Privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of + members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset. + By default, the Euclidean distance is used (L2 norm), but another ``compute_distance()`` method can be provided + in configuration instead. + The area under the receiver operating characteristic curve (AUC ROC) gives the privacy risk measure. 
+ """ + + def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, + synthetic_data: ArrayDataset, + config: DatasetAttackConfigMembershipKnnProbabilities = DatasetAttackConfigMembershipKnnProbabilities(), + dataset_name: str = DEFAULT_DATASET_NAME): + """ + :param original_data_members: A container for the training original samples and labels + :param original_data_non_members: A container for the holdout original samples and labels + :param synthetic_data: A container for the synthetic samples and labels + :param config: Configuration parameters to guide the attack, optional + :param dataset_name: A name to identify this dataset, optional + """ + attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size) + super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name, + attack_strategy_utils) + if config.compute_distance: + self.knn_learner = NearestNeighbors(n_neighbors=config.k, algorithm='auto', metric=config.compute_distance, + metric_params=config.distance_params) + else: + self.knn_learner = NearestNeighbors(n_neighbors=config.k, algorithm='auto') + + def assess_privacy(self) -> DatasetAttackScoreMembershipKnnProbabilities: + """ + Membership Inference Attack which calculates probabilities of member and non-member samples to be generated by + the synthetic data generator. + The assumption is that since the generative model is trained to approximate the training data distribution + then the probability of a sample to be a member of the training data should be proportional to the probability + that the query sample can be generated by the generative model. + So, if the probability that the query sample is generated by the generative model is large, + it is more likely that the query sample was used to train the generative model. 
This probability is approximated + by the Parzen window density estimation in ``probability_per_sample()``, computed from the NN distances from the + query samples to the synthetic data samples. + + :return: + Privacy score of the attack together with the attack result with the probabilities of member and + non-member samples to be generated by the synthetic data generator based on the NN distances from the + query samples to the synthetic data samples + """ + # nearest neighbor search + self.attack_strategy_utils.fit(self.knn_learner, self.synthetic_data) + + # members query + member_proba = self.attack_strategy_utils.find_knn(self.knn_learner, self.original_data_members, + self.probability_per_sample) + + # non-members query + non_member_proba = self.attack_strategy_utils.find_knn(self.knn_learner, self.original_data_non_members, + self.probability_per_sample) + + result = DatasetAttackResultMembership(member_probabilities=member_proba, + non_member_probabilities=non_member_proba) + + score = self.calculate_privacy_score(result, self.config.generate_plot) + return score + + def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultMembership, + generate_plot: bool = False) -> DatasetAttackScoreMembershipKnnProbabilities: + """ + Evaluate privacy score from the probabilities of member and non-member samples to be generated by the synthetic + data generator. The probabilities are computed by the ``assess_privacy()`` method. 
+ :param dataset_attack_result: attack result containing probabilities of member and non-member samples to be + generated by the synthetic data generator + :param generate_plot: generate AUC ROC curve plot and persist it + :return: + score of the attack, based on distance-based probabilities - mainly the ROC AUC score + """ + member_proba, non_member_proba = \ + dataset_attack_result.member_probabilities, dataset_attack_result.non_member_probabilities + fpr, tpr, threshold, auc, ap = self.calculate_metrics(member_proba, non_member_proba) + score = DatasetAttackScoreMembershipKnnProbabilities(self.dataset_name, + result=dataset_attack_result, + roc_auc_score=auc, average_precision_score=ap) + if generate_plot: + self.plot_roc_curve(self.dataset_name, member_proba, non_member_proba) + return score + + @staticmethod + def probability_per_sample(distances: np.ndarray): + """ + For every query sample, represented by the distances from the query sample to its KNNs in synthetic data, + computes the probability estimate of the query sample being generated, as the average of exp(-distance). 
+ :param distances: distance between every query sample in batch to its KNNs among synthetic samples, a numpy + array of size (n, k) with n being the number of samples, k - the number of KNNs + :return: + probability estimates of the query samples being generated and so - of being part of the synthetic set, a + numpy array of size (n,) + """ + return np.average(np.exp(-distances), axis=1) diff --git a/apt/risk/data_assessment/dataset_attack_result.py b/apt/risk/data_assessment/dataset_attack_result.py new file mode 100644 index 0000000..0ed0bd4 --- /dev/null +++ b/apt/risk/data_assessment/dataset_attack_result.py @@ -0,0 +1,24 @@ +from dataclasses import dataclass +from typing import Optional + +import numpy as np + +DEFAULT_DATASET_NAME = "dataset" + + +@dataclass +class DatasetAttackResult: + pass + + +@dataclass +class DatasetAttackScore: + dataset_name: str + risk_score: float + result: Optional[DatasetAttackResult] + + +@dataclass +class DatasetAttackResultMembership(DatasetAttackResult): + member_probabilities: np.ndarray + non_member_probabilities: np.ndarray diff --git a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py new file mode 100644 index 0000000..1a57bbd --- /dev/null +++ b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py @@ -0,0 +1,127 @@ +""" +This module implements privacy risk assessment of synthetic datasets based on the papers +"Data Synthesis based on Generative Adversarial Networks." by N. Park, M. Mohammadi, K. Gorde, S. Jajodia, H. Park, +and Y. Kim in International Conference on Very Large Data Bases (VLDB), 2018. +and "Holdout-Based Fidelity and Privacy Assessment of Mixed-Type Synthetic Data" by M. Platzer and T. Reutterer. +and on a variation of its reference implementation in https://github.com/mostly-ai/paper-fidelity-accuracy. 
+""" +from dataclasses import dataclass + +import numpy as np +from sklearn.neighbors import NearestNeighbors + +from apt.risk.data_assessment.attack_strategy_utils import KNNAttackStrategyUtils +from apt.risk.data_assessment.dataset_attack import Config, DatasetAttack +from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, DEFAULT_DATASET_NAME +from apt.utils.datasets import ArrayDataset + +K = 1 # Number of nearest neighbors to search. For DCR we need only the nearest neighbor. + + +@dataclass +class DatasetAttackConfigWholeDatasetKnnDistance(Config): + """Configuration for DatasetAttackWholeDatasetKnnDistance. + + Attributes: + use_batches: Divide query samples into batches or not. + batch_size: Query sample batch size. + compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must return + one value indicating the distance between those vectors. + See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation. + distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in + sklearn.neighbors.NearestNeighbors documentation. + """ + use_batches: bool = False + batch_size: int = 10 + compute_distance: callable = None + distance_params: dict = None + + +@dataclass +class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore): + """DatasetAttackWholeDatasetKnnDistance privacy risk score. + """ + share: float + assessment_type: str = 'WholeDatasetKnnDistance' # to be used in reports + + def __init__(self, dataset_name: str, share: float) -> None: + """ + dataset_name: dataset name to be used in reports + share : the share of synthetic records closer to the training than the holdout dataset. + A value of 0.5 or close to it means good privacy. 
+ """ + super().__init__(dataset_name=dataset_name, risk_score=share, result=None) + self.share = share + + +class DatasetAttackWholeDatasetKnnDistance(DatasetAttack): + """ + Privacy risk assessment for synthetic datasets based on distances of synthetic data records from + members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic + records closer to the training than the holdout dataset. + By default, the Euclidean distance is used (L2 norm), but another compute_distance() method can be provided in + configuration instead. + """ + + def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, + synthetic_data: ArrayDataset, + config: DatasetAttackConfigWholeDatasetKnnDistance = DatasetAttackConfigWholeDatasetKnnDistance(), + dataset_name: str = DEFAULT_DATASET_NAME): + """ + :param original_data_members: A container for the training original samples and labels + :param original_data_non_members: A container for the holdout original samples and labels + :param synthetic_data: A container for the synthetic samples and labels + :param config: Configuration parameters to guide the assessment process, optional + :param dataset_name: A name to identify this dataset, optional + """ + attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size) + super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name, + attack_strategy_utils) + if config.compute_distance: + self.knn_learner_members = NearestNeighbors(n_neighbors=K, metric=config.compute_distance, + metric_params=config.distance_params) + self.knn_learner_non_members = NearestNeighbors(n_neighbors=K, metric=config.compute_distance, + metric_params=config.distance_params) + else: + self.knn_learner_members = NearestNeighbors(n_neighbors=K) + self.knn_learner_non_members = NearestNeighbors(n_neighbors=K) + + def assess_privacy(self) -> 
DatasetAttackScoreWholeDatasetKnnDistance: + """ + Calculate the share of synthetic records closer to the training than the holdout dataset, based on the + DCR computed by 'calculate_distances()'. + :return: + score of the attack, based on the NN distances from the synthetic samples to the original data samples + """ + member_distances, non_member_distances = self.calculate_distances() + # distance of the synth. records to members and to non-members + assert (len(member_distances) == len(non_member_distances)) + n_members = len(self.original_data_members.get_samples()) + n_non_members = len(self.original_data_non_members.get_samples()) + + # percent of synth. records closer to members, + # and distance ties are split in proportion to the sizes of the member and non-member sets + share = np.mean(member_distances < non_member_distances) + (n_members / (n_members + n_non_members)) * np.mean( + member_distances == non_member_distances) + score = DatasetAttackScoreWholeDatasetKnnDistance(self.dataset_name, share=share) + return score + + def calculate_distances(self): + """ + Calculate the distance of every synthetic sample to its nearest neighbor among the member samples and among + the non-member samples. This distance is called distance to the closest record (DCR), as defined by + N. Park et al. in "Data Synthesis based on Generative Adversarial Networks."
+ + :return: + member_distances - distances of each synthetic data member from its nearest training sample + non_member_distances - distances of each synthetic data member from its nearest validation sample + """ + # nearest neighbor search + self.attack_strategy_utils.fit(self.knn_learner_members, self.original_data_members) + self.attack_strategy_utils.fit(self.knn_learner_non_members, self.original_data_non_members) + + # distances of the synthetic data from the member and non-member samples + member_distances = self.attack_strategy_utils.find_knn(self.knn_learner_members, self.synthetic_data) + non_member_distances = self.attack_strategy_utils.find_knn(self.knn_learner_non_members, self.synthetic_data) + + return member_distances, non_member_distances diff --git a/requirements.txt b/requirements.txt index 4a8c0a6..b6f5d56 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,8 @@ pandas==1.1.05 scipy==1.4.1 scikit-learn>=0.22.2 torch>=1.8.0 +tqdm>=4.64.1 +matplotlib>=3.7.0 adversarial-robustness-toolbox>=1.11.0 # testing diff --git a/tests/test_data_assessment.py b/tests/test_data_assessment.py new file mode 100644 index 0000000..b83a382 --- /dev/null +++ b/tests/test_data_assessment.py @@ -0,0 +1,173 @@ +import numpy as np +import pytest +from sklearn.compose import ColumnTransformer +from sklearn.decomposition import PCA +from sklearn.impute import SimpleImputer +from sklearn.model_selection import GridSearchCV +from sklearn.neighbors import KernelDensity +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +from apt.anonymization import Anonymize +from apt.risk.data_assessment.dataset_assessment_manager import DatasetAssessmentManager, DatasetAssessmentManagerConfig +from apt.utils.dataset_utils import get_iris_dataset_np, get_diabetes_dataset_np, get_adult_dataset_pd, \ + get_nursery_dataset_pd +from apt.utils.datasets import ArrayDataset + +MIN_SHARE = 0.5 +MIN_ROC_AUC = 0.0 +MIN_PRECISION = 0.0 + 
+NUM_SYNTH_SAMPLES = 40000 +NUM_SYNTH_COMPONENTS = 4 + +iris_dataset_np = get_iris_dataset_np() +diabetes_dataset_np = get_diabetes_dataset_np() +nursery_dataset_pd = get_nursery_dataset_pd() +adult_dataset_pd = get_adult_dataset_pd() + +mgr = DatasetAssessmentManager(DatasetAssessmentManagerConfig(persist_reports=False, generate_plots=False)) + + +def teardown_function(): + mgr.dump_all_scores_to_files() + + +anon_testdata = [('iris_np', iris_dataset_np, 'np', k, mgr) for k in range(2, 10, 4)] \ + + [('diabetes_np', diabetes_dataset_np, 'np', k, mgr) for k in range(2, 10, 4)] \ + + [('nursery_pd', nursery_dataset_pd, 'pd', k, mgr) for k in range(2, 10, 4)] \ + + [('adult_pd', adult_dataset_pd, 'pd', k, mgr) for k in range(2, 10, 4)] + + +@pytest.mark.parametrize("name, data, dataset_type, k, mgr", anon_testdata) +def test_risk_anonymization(name, data, dataset_type, k, mgr): + (x_train, y_train), (x_test, y_test) = data + + if dataset_type == 'np': + # no need to preprocess + preprocessed_x_train = x_train + preprocessed_x_test = x_test + QI = [0, 2] + anonymizer = Anonymize(k, QI, train_only_QI=True) + elif "adult" in name: + preprocessed_x_train, preprocessed_x_test = preprocess_adult_x_data(x_train, x_test) + QI = list(range(15, 27)) + anonymizer = Anonymize(k, QI) + elif "nursery" in name: + preprocessed_x_train, preprocessed_x_test = preprocess_nursery_x_data(x_train, x_test) + QI = list(range(15, 27)) + anonymizer = Anonymize(k, QI, train_only_QI=True) + else: + raise ValueError('Pandas dataset missing a preprocessing step') + + anonymized_data = ArrayDataset(anonymizer.anonymize(ArrayDataset(preprocessed_x_train, y_train))) + original_data_members = ArrayDataset(preprocessed_x_train, y_train) + original_data_non_members = ArrayDataset(preprocessed_x_test, y_test) + + dataset_name = f'anon_k{k}_{name}' + assess_privacy_and_validate_result(mgr, original_data_members, original_data_non_members, anonymized_data, + dataset_name) + + +testdata = [('iris_np', 
iris_dataset_np, 'np', mgr), + ('diabetes_np', diabetes_dataset_np, 'np', mgr), + ('nursery_pd', nursery_dataset_pd, 'pd', mgr), + ('adult_pd', adult_dataset_pd, 'pd', mgr)] + + +@pytest.mark.parametrize("name, data, dataset_type, mgr", testdata) +def test_risk_kde(name, data, dataset_type, mgr): + (x_train, y_train), (x_test, y_test) = data + + if dataset_type == 'np': + encoded = x_train + encoded_test = x_test + num_synth_components = NUM_SYNTH_COMPONENTS + elif "adult" in name: + encoded, encoded_test = preprocess_adult_x_data(x_train, x_test) + num_synth_components = 10 + elif "nursery" in name: + encoded, encoded_test = preprocess_nursery_x_data(x_train, x_test) + num_synth_components = 10 + else: + raise ValueError('Pandas dataset missing a preprocessing step') + + synth_data = ArrayDataset( + kde(NUM_SYNTH_SAMPLES, n_components=num_synth_components, original_data=encoded)) + original_data_members = ArrayDataset(encoded, y_train) + original_data_non_members = ArrayDataset(encoded_test, y_test) + + dataset_name = 'kde' + str(NUM_SYNTH_SAMPLES) + name + assess_privacy_and_validate_result(mgr, original_data_members, original_data_non_members, synth_data, dataset_name) + + +def kde(n_samples, n_components, original_data): + """ + Simple synthetic data generator: estimates the kernel density of data using a Gaussian kernel and then generates + samples from this distribution + """ + digit_data = original_data + pca = PCA(n_components=n_components, whiten=False) + data = pca.fit_transform(digit_data) + params = {'bandwidth': np.logspace(-1, 1, 20)} + grid = GridSearchCV(KernelDensity(), params, cv=5) + grid.fit(data) + + kde_estimator = grid.best_estimator_ + + new_data = kde_estimator.sample(n_samples, random_state=0) + new_data = pca.inverse_transform(new_data) + return new_data + + +def preprocess_adult_x_data(x_train, x_test): + features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', + 'capital-gain',
'capital-loss', 'hours-per-week', 'native-country'] + categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', + 'native-country'] + # prepare data for DT + numeric_features = [f for f in features if f not in categorical_features] + numeric_transformer = Pipeline( + steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))] + ) + categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False) + preprocessor = ColumnTransformer( + transformers=[ + ("num", numeric_transformer, numeric_features), + ("cat", categorical_transformer, categorical_features), + ] + ) + encoded = preprocessor.fit_transform(x_train) + encoded_test = preprocessor.transform(x_test) # reuse the encoding fitted on the training data + return encoded, encoded_test + + +def preprocess_nursery_x_data(x_train, x_test): + x_train, x_test = x_train.astype(str), x_test.astype(str) # same dtype so fitted categories match + features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health"] + # QI = ["finance", "social", "health"] + categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health", 'children'] + # prepare data for DT + numeric_features = [f for f in features if f not in categorical_features] + numeric_transformer = Pipeline( + steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))] + ) + categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False) + preprocessor = ColumnTransformer( + transformers=[ + ("num", numeric_transformer, numeric_features), + ("cat", categorical_transformer, categorical_features), + ] + ) + encoded = preprocessor.fit_transform(x_train) + encoded_test = preprocessor.transform(x_test) # reuse the encoding fitted on the training data + return encoded, encoded_test + + +def assess_privacy_and_validate_result(dataset_assessment_manager, original_data_members, original_data_non_members, + synth_data, dataset_name): + [score_g, score_h] = dataset_assessment_manager.assess(original_data_members, original_data_non_members, synth_data, + dataset_name) + assert
(score_g.roc_auc_score > MIN_ROC_AUC) + assert (score_g.average_precision_score > MIN_PRECISION) + assert (score_h.share > MIN_SHARE) diff --git a/tests/test_data_assessment_short_test.py b/tests/test_data_assessment_short_test.py new file mode 100644 index 0000000..1089d4c --- /dev/null +++ b/tests/test_data_assessment_short_test.py @@ -0,0 +1,109 @@ +import pytest + +from apt.anonymization import Anonymize +from apt.risk.data_assessment.dataset_assessment_manager import DatasetAssessmentManager, DatasetAssessmentManagerConfig +from apt.utils.dataset_utils import get_iris_dataset_np, get_nursery_dataset_pd +from apt.utils.datasets import ArrayDataset +from tests.test_data_assessment import kde, preprocess_nursery_x_data + +NUM_SYNTH_SAMPLES = 10 +NUM_SYNTH_COMPONENTS = 2 +ANON_K = 2 +MIN_SHARE = 0.5 +MIN_ROC_AUC = 0.0 +MIN_PRECISION = 0.0 + +iris_dataset_np = get_iris_dataset_np() +nursery_dataset_pd = get_nursery_dataset_pd() + +mgr1 = DatasetAssessmentManager(DatasetAssessmentManagerConfig(persist_reports=False, generate_plots=False)) +mgr2 = DatasetAssessmentManager(DatasetAssessmentManagerConfig(persist_reports=False, generate_plots=True)) +mgr3 = DatasetAssessmentManager(DatasetAssessmentManagerConfig(persist_reports=True, generate_plots=False)) +mgr4 = DatasetAssessmentManager(DatasetAssessmentManagerConfig(persist_reports=True, generate_plots=True)) +mgrs = [mgr1, mgr2, mgr3, mgr4] + + +def teardown_function(): + for mgr in mgrs: + mgr.dump_all_scores_to_files() + + +anon_testdata = [('iris_np', iris_dataset_np, 'np', mgr1)] \ + + [('nursery_pd', nursery_dataset_pd, 'pd', mgr2)] \ + + [('iris_np', iris_dataset_np, 'np', mgr3)] \ + + [('nursery_pd', nursery_dataset_pd, 'pd', mgr4)] + + +@pytest.mark.parametrize("name, data, dataset_type, mgr", anon_testdata) +def test_risk_anonymization(name, data, dataset_type, mgr): + (x_train, y_train), (x_test, y_test) = data + + if dataset_type == 'np': + # no need to preprocess + preprocessed_x_train = x_train + 
preprocessed_x_test = x_test + QI = [0, 2] + anonymizer = Anonymize(ANON_K, QI, train_only_QI=True) + elif "nursery" in name: + preprocessed_x_train, preprocessed_x_test = preprocess_nursery_x_data(x_train, x_test) + QI = list(range(15, 27)) + anonymizer = Anonymize(ANON_K, QI, train_only_QI=True) + else: + raise ValueError('Pandas dataset missing a preprocessing step') + + anonymized_data = ArrayDataset(anonymizer.anonymize(ArrayDataset(preprocessed_x_train, y_train))) + original_data_members = ArrayDataset(preprocessed_x_train, y_train) + original_data_non_members = ArrayDataset(preprocessed_x_test, y_test) + + dataset_name = f'anon_k{ANON_K}_{name}' + assess_privacy_and_validate_result(mgr, original_data_members, original_data_non_members, anonymized_data, + dataset_name) + + assess_privacy_and_validate_result(mgr, original_data_members=original_data_members, + original_data_non_members=original_data_non_members, + synth_data=anonymized_data, dataset_name=None) + + +testdata = [('iris_np', iris_dataset_np, 'np', mgr4), + ('nursery_pd', nursery_dataset_pd, 'pd', mgr3), + ('iris_np', iris_dataset_np, 'np', mgr2), + ('nursery_pd', nursery_dataset_pd, 'pd', mgr1)] + + +@pytest.mark.parametrize("name, data, dataset_type, mgr", testdata) +def test_risk_kde(name, data, dataset_type, mgr): + (x_train, y_train), (x_test, y_test) = data + + if dataset_type == 'np': + encoded = x_train + encoded_test = x_test + num_synth_components = NUM_SYNTH_COMPONENTS + elif "nursery" in name: + encoded, encoded_test = preprocess_nursery_x_data(x_train, x_test) + num_synth_components = 10 + else: + raise ValueError('Pandas dataset missing a preprocessing step') + + synth_data = ArrayDataset( + kde(NUM_SYNTH_SAMPLES, n_components=num_synth_components, original_data=encoded)) + original_data_members = ArrayDataset(encoded, y_train) + original_data_non_members = ArrayDataset(encoded_test, y_test) + + dataset_name = 'kde' + str(NUM_SYNTH_SAMPLES) + name + 
assess_privacy_and_validate_result(mgr, original_data_members, original_data_non_members, synth_data, dataset_name) + + assess_privacy_and_validate_result(mgr, original_data_members=original_data_members, + original_data_non_members=original_data_non_members, + synth_data=synth_data, dataset_name=None) + + +def assess_privacy_and_validate_result(mgr, original_data_members, original_data_non_members, synth_data, + dataset_name): + if dataset_name: + [score_g, score_h] = mgr.assess(original_data_members, original_data_non_members, synth_data, + dataset_name) + else: + [score_g, score_h] = mgr.assess(original_data_members, original_data_non_members, synth_data) + assert (score_g.roc_auc_score > MIN_ROC_AUC) + assert (score_g.average_precision_score > MIN_PRECISION) + assert (score_h.share > MIN_SHARE) From 98a7a078bb3d507501645134ed6658f29f874661 Mon Sep 17 00:00:00 2001 From: andersonm-ibm <63074550+andersonm-ibm@users.noreply.github.com> Date: Thu, 4 May 2023 12:21:42 +0300 Subject: [PATCH 03/11] Add dataset privacy risk assessment example notebook. (#73) * Add dataset assessment notebook and reference to module from project README Signed-off-by: Maya Anderson --- README.md | 3 + apt/risk/data_assessment/dataset_attack.py | 1 - notebooks/dataset_assessment_nursery.ipynb | 402 +++++++++++++++++++++ requirements.txt | 5 + 4 files changed, 410 insertions(+), 1 deletion(-) create mode 100644 notebooks/dataset_assessment_nursery.ipynb diff --git a/README.md b/README.md index 43487f6..065d0e1 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,9 @@ minimization principle in GDPR for ML models. It enables to reduce the amount of personal data needed to perform predictions with a machine learning model, while still enabling the model to make accurate predictions. This is done by by removing or generalizing some of the input features. 
+The [**dataset assessment**](apt/risk/data_assessment/README.md) module implements a tool for privacy assessment of +synthetic datasets that are to be used in AI model training. + Official ai-privacy-toolkit documentation: https://ai-privacy-toolkit.readthedocs.io/en/latest/ Installation: pip install ai-privacy-toolkit diff --git a/apt/risk/data_assessment/dataset_attack.py b/apt/risk/data_assessment/dataset_attack.py index 4cac42d..e057c8a 100644 --- a/apt/risk/data_assessment/dataset_attack.py +++ b/apt/risk/data_assessment/dataset_attack.py @@ -87,7 +87,6 @@ class DatasetAttackMembership(DatasetAttack): labels = np.concatenate((np.zeros((len(non_member_probabilities),)), np.ones((len(member_probabilities),)))) results = np.concatenate((non_member_probabilities, member_probabilities)) svc_disp = RocCurveDisplay.from_predictions(labels, results) - svc_disp.plot() plt.plot([0, 1], [0, 1], color="navy", linewidth=2, linestyle="--", label='No skills') plt.title('ROC curve') plt.savefig(f'{filename_prefix}{dataset_name}_roc_curve.png') diff --git a/notebooks/dataset_assessment_nursery.ipynb b/notebooks/dataset_assessment_nursery.ipynb new file mode 100644 index 0000000..dae2ce0 --- /dev/null +++ b/notebooks/dataset_assessment_nursery.ipynb @@ -0,0 +1,402 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Using AI privacy dataset assessment" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "In this tutorial we will show how to perform privacy risk analysis of synthetic datasets for ML models using the dataset assessment module.\n", + "\n", + "This will be demonstrated using the Nursery dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/datasets/nursery).\n", + "\n", + "The method `get_nursery_dataset_pd()` preprocesses the data such 
that all categorical features are one-hot encoded, and all the features are scaled." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Load data\n", + "Load the nursery dataset with preprocessing and divided into a training and a test (holdout) dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "sys.path.insert(0, os.path.abspath('..'))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from apt.utils.dataset_utils import get_nursery_dataset_pd\n", + "\n", + "(x_train, y_train), (x_test, y_test) = get_nursery_dataset_pd(raw=False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### A simplistic synthetic data generator\n", + "We are using here a simple synthetic data generator just for testing purposes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "from sklearn.neighbors import KernelDensity\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.decomposition import PCA\n", + "import numpy as np\n", + "\n", + "\n", + "def kde(n_samples, n_components, original_data):\n", + " \"\"\"\n", + " Simple synthetic data generator: estimates the kernel density of data using a Gaussian kernel and then generates\n", + " samples from this distribution\n", + " \"\"\"\n", + " digit_data = original_data\n", + " pca = PCA(n_components=n_components, whiten=False)\n", + " data = pca.fit_transform(digit_data)\n", + " params = {'bandwidth': np.logspace(-1, 1, 20)}\n", + " grid = GridSearchCV(KernelDensity(), params, cv=5)\n", + " grid.fit(data)\n", + "\n", + " kde_estimator = grid.best_estimator_\n", + "\n", + " new_data = kde_estimator.sample(n_samples, random_state=0)\n", + " new_data = pca.inverse_transform(new_data)\n", + " return new_data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Generate synthetic data based on the training data provided using the above simple synthetic data generator." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "from apt.utils.datasets import ArrayDataset\n", + "\n", + "NUM_SYNTH_SAMPLES = 1000\n", + "num_synth_components = 4\n", + "synthetic_data = ArrayDataset(\n", + " kde(NUM_SYNTH_SAMPLES, n_components=num_synth_components, original_data=x_train))\n", + "original_data_members = ArrayDataset(x_train, y_train)\n", + "original_data_non_members = ArrayDataset(x_test, y_test)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run dataset assessment attacks using the DatasetAssessmentManager\n", + "Run all the dataset assessment attacks and get all their scores." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "from apt.risk.data_assessment.dataset_assessment_manager import DatasetAssessmentManager\n", + "\n", + "mgr = DatasetAssessmentManager()\n", + "[score_g, score_h] = mgr.assess(original_data_members, original_data_non_members, synthetic_data)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can look at the detailed scores of all the attacks:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[DatasetAttackScoreMembershipKnnProbabilities(dataset_name='dataset', risk_score=0.5247189047081302, result=DatasetAttackResultMembership(member_probabilities=array([0.01112053, 0.03040544, 0.00952443, ..., 0.0425625 , 0.01733997,\n", + " 0.0203852 ]), non_member_probabilities=array([0.01553551, 0.01538259, 0.01611245, ..., 0.01016964, 0.01561895,\n", + " 0.01174237])), roc_auc_score=0.5247189047081302, average_precision_score=0.8141482366545616, 
assessment_type='MembershipKnnProbabilities'),\n", + " DatasetAttackScoreWholeDatasetKnnDistance(dataset_name='dataset', risk_score=0.841, result=None, share=0.841, assessment_type='WholeDatasetKnnDistance')]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[score_g, score_h]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Or you can look at only the privacy risk scores of all the attacks:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0.5247189047081302, 0.841]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[score_g.risk_score, score_h.risk_score]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Run dataset assessment attacks directly" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### DatasetAttackMembershipKnnProbabilities\n", + "Run the privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of\n", + "members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset.\n", + "The area under the receiver operating characteristic curve (AUC ROC) gives the privacy risk measure.\n", + "The ROC curve is displayed and saved in a file `nursery_kde_roc_curve.png`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1036/1036 [00:11<00:00, 87.53it/s] \n", + "100%|██████████| 259/259 [00:02<00:00, 109.32it/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "DatasetAttackScoreMembershipKnnProbabilities(dataset_name='nursery_kde', risk_score=0.5246348071734173, result=DatasetAttackResultMembership(member_probabilities=array([0.01112053, 0.03040544, 0.00952443, ..., 0.01370366, 0.03162697,\n", + " 0.02039033]), non_member_probabilities=array([0.01553551, 0.01538259, 0.01611245, ..., 0.02506744, 0.02278329,\n", + " 0.01016964])), roc_auc_score=0.5246348071734173, average_precision_score=0.8140989865974944, assessment_type='MembershipKnnProbabilities')" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAjcAAAHHCAYAAABDUnkqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAACKF0lEQVR4nOzddVxU2fsH8M/QoISKqCAKdiuKotiKYrdgo67duXbnfl1r1dV1DWywdQ3swC6wUEzEAAyke+b8/vDn1VlyWIb8vF8vXst97jl3nrmLzMO9554jE0IIEBEREeUSGlmdABEREVFGYnFDREREuQqLGyIiIspVWNwQERFRrsLihoiIiHIVFjdERESUq7C4ISIiolyFxQ0RERHlKixuiIiIKFdhcUNERES5CosbIkqVq6srZDKZ9KWlpQULCwv0798f79+/T7KPEAI7duxAo0aNYGJiAgMDA1StWhXz589HZGRksq916NAhtG7dGqamptDR0YG5uTmcnJxw/vx5db09IsplZFxbiohS4+rqigEDBmD+/PmwtrZGTEwMbty4AVdXV1hZWeHRo0fQ09OT2svlcvTq1Qt79+5Fw4YN0aVLFxgYGMDT0xO7d+9GpUqVcPbsWRQpUkTqI4TAwIED4erqChsbG3Tr1g1FixZFQEAADh06hLt37+Lq1auwt7fPilNARDmJICJKxdatWwUAcfv2baX4lClTBADh7u6uFF+8eLEAICZNmpToWEePHhUaGhqiVatWSvFly5YJAGLcuHFCoVAk6rd9+3Zx8+bNDHg36RcREZGlr09EacPbUkSUbg0bNgQAvHz5UopFR0dj2bJlKFeuHJYsWZKoT/v27eHi4gIPDw/cuHFD6rNkyRJUqFABv//+O2QyWaJ+ffv2RZ06dVLMR6FQYPXq1ahatSr09PRQuHBhtGrVCnfu3AEA+Pn5QSaTwdXVNVFfmUyGuXPnSttz586FTCaDj48PevXqhQIFCqBBgwZSfm/evEl0jGnTpkFHRwdfv36VYjdv3kSrVq1gbGwMAwMDNG7cGFevXk3xfRDRf8PihojSzc/PDwBQoEABKXblyhV8/foVvXr1gpaWVpL9+vXrBwA4duyY1Cc4OBi9evWCpqZmuvP55ZdfMG7cOFhaWuK3337D1KlToaenJxVR6dG9e3dERUVh8eLFGDx4MJycnCCTybB3795Ebffu3YuWLVtK5+P8+fNo1KgRwsLCMGfOHCxevBghISFo1qwZbt26le6ciChlSf/mISJKQmhoKD5//oyYmBjcvHkT8+bNg66uLtq1aye18fHxAQBUr1492eN83/fkyROl/1atWjXduV24cAGurq4YM2YMVq9eLcUnTpwI8R+GFlavXh27d+9WitWtWxfu7u6YPHmyFLt9+zZevXolXf0RQmDYsGFo2rQpTp48KV2NGjp0KCpXroyZM2fi9OnT6c6LiJLHKzdElGYODg4oXLgwLC0t0a1bN+TLlw9Hjx5F8eLFpTbh4eEAAENDw2SP831fWFiY0n9T6pOaAwcOQCaTYc6cOYn2JXWbK62GDRuWKObs7Iy7d+8q3Y5zd3eHrq4uOnbsCADw9vbG8+fP0atXL3z58gWfP3/G58+fERkZiebNm+Py5ctQKBTpzouIksfihojSbN26dThz5gz279+PNm3a4PPnz9DV1VVq871A+V7kJOXfBZCRkVGqfVLz8uVLmJubo2DBguk+RlKsra0Txbp37w4NDQ24u7sD+HaVZt++fWjdurX0Xp4/fw4AcHFxQeHChZW+Nm3ahNjYWISGhmZorkT0DW9LEVGa1alTB7a2tgCATp06oUGDBujVqxd8fX2RP39+AEDFihUBAA8ePECnTp2SPM6DBw8AAJUqVQIAVKhQAQDw8OHDZPtkhOSu4Mjl8mT76OvrJ4qZm5ujYcOG2Lt3L6ZPn44bN27A398fv/32m9Tm+1WZZcuWoUaNGkke+/s5I6KMxSs3RJQumpqaWLJkCT5
8+IC1a9dK8QYNGsDExAS7d+9OtmjYvn07AEhjdRo0aIACBQpgz549KRYaKSldujQ+fPiA4ODgZNt8H+gbEhKiFE/qyafUODs74/79+/D19YW7uzsMDAzQvn17pXyAb1elHBwckvzS1tZW+XWJKHUsbogo3Zo0aYI6depg1apViImJAQAYGBhg0qRJ8PX1xYwZMxL1OX78OFxdXeHo6Ii6detKfaZMmYInT55gypQpSQ4A3rlzZ4pPGHXt2hVCCMybNy/Rvu/HMzIygqmpKS5fvqy0/88//0z7m/7p9TQ1NbFnzx7s27cP7dq1Q758+aT9tWrVQunSpfH7778jIiIiUf9Pnz6p/JpElDa8LUVE/8nkyZPRvXt3uLq6SoNvp06dCi8vL/z222+4fv06unbtCn19fVy5cgU7d+5ExYoVsW3btkTHefz4MZYvX44LFy5IMxQHBgbi8OHDuHXrFq5du5ZsHk2bNkXfvn3xxx9/4Pnz52jVqhUUCgU8PT3RtGlTjBo1CgAwaNAgLF26FIMGDYKtrS0uX76MZ8+eqfy+zczM0LRpU6xYsQLh4eFwdnZW2q+hoYFNmzahdevWqFy5MgYMGAALCwu8f/8eFy5cgJGREf755x+VX5eI0iArZxAkopwhuRmKhRBCLpeL0qVLi9KlS4uEhASl+NatW0X9+vWFkZGR0NPTE5UrVxbz5s1Lcabf/fv3i5YtW4qCBQsKLS0tUaxYMeHs7CwuXryYap4JCQli2bJlokKFCkJHR0cULlxYtG7dWty9e1dqExUVJX755RdhbGwsDA0NhZOTk/j48aMAIObMmSO1mzNnjgAgPn36lOzr/f333wKAMDQ0FNHR0Um28fLyEl26dBGFChUSurq6omTJksLJyUmcO3cu1fdDROnDtaWIiIgoV+GYGyIiIspVWNwQERFRrsLihoiIiHIVFjdERESUq7C4ISIiolyFxQ0RERHlKnluEj+FQoEPHz7A0NDwP60UTERERJlHCIHw8HCYm5tDQyPlazN5rrj58OEDLC0tszoNIiIiSoe3b9+iePHiKbbJc8WNoaEhgG8nx8jIKIuzISIiorQICwuDpaWl9DmekjxX3Hy/FWVkZMTihoiIKIdJy5ASDigmIiKiXIXFDREREeUqLG6IiIgoV2FxQ0RERLkKixsiIiLKVVjcEBERUa7C4oaIiIhyFRY3RERElKuwuCEiIqJchcUNERER5SpZWtxcvnwZ7du3h7m5OWQyGQ4fPpxqn4sXL6JmzZrQ1dVFmTJl4OrqqvY8iYiIKOfI0uImMjIS1atXx7p169LU/vXr12jbti2aNm0Kb29vjBs3DoMGDcKpU6fUnCkRERHlFFm6cGbr1q3RunXrNLffsGEDrK2tsXz5cgBAxYoVceXKFaxcuRKOjo7qSpOIiIjS4EtIDCLlcujpaMLMUC/L8shRY26uX78OBwcHpZijoyOuX7+ebJ/Y2FiEhYUpfREREVHGSZArsGy3N4qWXgWbnnsxbMfdLM0nRxU3gYGBKFKkiFKsSJEiCAsLQ3R0dJJ9lixZAmNjY+nL0tIyM1IlIiLK9YQQmHHoIUoMOYBf+x1FQnAsgs+/xYP7QVmaV44qbtJj2rRpCA0Nlb7evn2b1SkRERHleHf8gmE97QR23fSHtqk+9K2NAQCFLY1wYmLjLM0tS8fcqKpo0aIIClKuBoOCgmBkZAR9ff0k++jq6kJXVzcz0iMiIsr1AkKjUW/JeaWYTCbDuSM9cWLPI8yc2Qi6ullbXuSo4qZevXo4ceKEUuzMmTOoV69eFmVERESU+4XFxOPXfQ/g8TgQQgiE3/sI7UL60LcyQhcbCyx3qg6ZTIZ6C5pldaoAsri4iYiIwIsXL6Tt169fw9vbGwULFkSJEiUwbdo0vH//Htu3bwcADBs2DGvXrsWvv/6KgQMH4vz589i7dy+OHz+eVW+BiIgo15r/jw+2XH0tbctjEvDlpB+in4VAJ7827vmOhoW5YRZ
mmLQsHXNz584d2NjYwMbGBgAwYcIE2NjYYPbs2QCAgIAA+Pv7S+2tra1x/PhxnDlzBtWrV8fy5cuxadMmPgZORESUwXbf9FcqbGI/RCDA1QfRz0IAAHER8Th54nkWZZcymRBCZHUSmSksLAzGxsYIDQ2FkZFRVqdDRESUrXwMi0G9pechV3wrD4QQaCPTwablN5GQoAAAFCyoD1fXjmjfvnym5aXK53eOGnNDRERE6rPzxhvMPPxI2pZHJ8D8fjA2XPpxF8Xe3hJ79nRFiRLGWZFimuT6R8GJiIgoda8/RyoVNtYJMuDQK9z6qbCZMqU+Ll50ydaFDcArN0RERHlaTLwc1eaeRpxcIcUmNimDOb0O4+vXGACAqakBtm/vhNaty2ZVmirhlRsiIqI8au7Rx6gwy0OpsBlY3xqjW5XHunVtAAANG5aAt/fQHFPYALxyQ0RElOfEJShQZc4ppaJGCAHv2S1RIJ8OAKBnz6rQ19dGu3bloKWVs66FsLghIiLKQ4QQKDfz5I9thYCTtj4iv8ZIhc13nTpVyOz0MgSLGyIiojziU3gsai86K23LI+Nh/SgEv198A+DbLaiePatmVXoZhsUNERFRLhcUFoMtV17jr8uvpFi0XxhkF9/DMygSAKChIcO7d2FZlWKGYnFDRESUi/XYeB03XgVL20IhoPPgC/xP++H7NL7FiuXH7t1d0aSJVdYkmcFY3BAREeVCCoVA498v4G1wtBQzlckQff4dfO4GSrGWLUtjx47OMDPLlxVpqgWLGyIiolym6txTCI9JUIqtsC+DYYOO4tOnKACApqYMCxY0xZQpDaChIcuKNNWGxQ0REVEu8fBdKNqvvZIo/mxhK7RtvVsqbCwsDOHm1g0NGpTI7BQzBYsbIiKiXGD0Hi/8c/+DUuz5otbQ1vw2R82OHZ1RvfoG2NqaY9u2TjA1NciKNDMFixsiIqIcbu/tt0qFzYgmpTG8vrVU2ABA0aL5cePGLyhZ0iTX3Yb6t5w15SAREREpef05Er8eeCBtX5zYGJ/O+aNq1fUI/mkwMQBYWxfI9YUNwCs3REREOZIQAhP33sdBr/dSbHL9UujZwR03brwDAAwYcASHDztDJsv9Bc3PWNwQERHlMPFyBbpvuA7vtyFSrK6GNmb2O4qQkG8reWtra6BZM6usSTCLsbghIiLKQd58iUTjZRelbSFXoHWUBv7687oUs7Y2gbt7N9SubZEFGWY9FjdEREQ5xCbPV1h4/Im0HR8Si4I3PuKv+0FSrFu3Sti0qT2MjfWyIsVsgcUNERFRNjdmjxeO/usx75oKDZzb8wwfwmIBADo6mli50hHDh9vmuTE2/8bihoiIKJt6+SkCzZdfShT/X9dq+HovCIf+v7ApU6Yg9u7tBhubYpmdYrbE4oaIiCib8XgUiGE77yaKuw6ojSblzQAAwrY4Llzwg4aGDH/91Q6GhrqZnWa2xeKGiIgoGxmw9RYu+H5SimlryrCxbRWpsAEAmUyG7ds7Q1tbI8/fhvo3TuJHRESUDRz2eg+rqceVCpsB9a3wZE5LNPssR7P6W/HPP75KfXR0NFnYJIFXboiIiLLY3KOP4XrNTyl2c3pzfP0QATu7TXj48CMAwMXlMJ49G52r14XKCCxuiIiIslDX9ddw981Xabu/vRVmt6uEnTsfYPjw44iKigcA6OtrYcUKRxY2acDihoiIKAtc9P2I/ltvK8VuzWiOfBoa+OWXo3B19ZbilSsXxt693VGpUuFMzjJnYnFDRESUyWYefoidN/yVYrdnOOCjfyiaOu2Hj8+PcTcDB9bAmjVtYGCgndlp5lgsboiIiDJJaHQ8Jrh749zTj1KsZ50SWNKlKv75xxfOzvsRHZ0AAMiXTxsbNrRDnz7VsirdHIvFDRERUSY47PUe49y9lWLes1vAxEAHAFC1ahHo6mohOjoB1aoVwd693VC+vGkWZJrzsbghIiJSow8h0bBfel4pJpMBp8c1kgobALCyMoG
ra0ecPPkCK1c6Ql+ft6HSi8UNERGRmgSFxSQqbLYNrINGZU2xa9dDFO1YXmlm4Y4dK6BjxwqZnWauw0n8iIiI1GDGoYewW3xO2tbSkMFvaVvYFDVCz54H0LfvIQwbdhxCiCzMMndicUNERJTBTj8OxK6bP56GKmWaD+cnNsG9ewGoWfMvuLs/BgDs3v0Q16+/y6o0cy3eliIiIspA//N4ij8vvpS2T41rhHJF8mPdutuYOPE04uLkAABjY11s3twB9vaWWZVqrsXihoiIKIN0/vMqvPxDpO2tA2qjiJ42unXbh4MHn0jx2rXN4e7eDdbWBbIgy9yPxQ0REVEGCAiNVipsDo2wR3xgFGxa/QU/vx/x8ePrYulSB+joaGZ+knkEixsiIqL/SAiBekt+PBX1eJ4jnj76iAYNtiA+XgEAKFBAD66undChQ/msSjPPSFdx4+/vjzdv3iAqKgqFCxdG5cqVoaurm3pHIiKiXOZtcBQa/u+CtG1ZUB/5dLVgY1MMLVuWxvHjz1GvXnG4uXVDiRLGWZhp3pHm4sbPzw/r16+Hm5sb3r17p/Tomo6ODho2bIghQ4aga9eu0NDgQ1hERJT7zT7yCNuvv1GKnRnfGACgoSHDtm2dsH79HUyZUh/a2rwNlVnSVIWMGTMG1atXx+vXr7Fw4UL4+PggNDQUcXFxCAwMxIkTJ9CgQQPMnj0b1apVw+3bt1M/KBERUQ4UHhOP2UcewWrqcamwEUIgn28ItrSsBL2fiphChQwwc2YjFjaZLE1XbvLly4dXr16hUKFCifaZmZmhWbNmaNasGebMmQMPDw+8ffsWtWvXzvBkiYiIstL7kGjU/9eMw/KoeJR5EoaLZ1+j941AeHsPRZEi+bMoQwIAmchjUyOGhYXB2NgYoaGhMDIyyup0iIgoh/ANDIfjqstKse4WBbF76XV8+BAO4NuaUa6undCvX/WsSDFXU+Xzm09LERERpcD/SxQaLbugFKtTsgCqBydg9rgzUCi+XSMwM8uHnTs7o0WL0lmRJv0kw4qbJ0+eoG3btnj16lVGHZKIiCjLJHULCgDKG+sj5NBLzDzz4/OuaVMr7NrVBcWKGWZmipSMDCtu4uLi8ObNm9QbEhERZXMejwIxbOddpVibqkXRrVhB9O59EIGBEQC+3YaaM6cxZs5sBE1NPimcXaS5uJkwYUKK+z99+vSfkyEiIspK4THxqDr3tFKsYVlTbHapjaiIOJQsuQphYbEAgKJF82P37i5o2tQ6K1KlFKS5uFm9ejVq1KiR7CCeiIiIDEuKiIgoMwkh4PzXDdzyC1aKz21fCf3rfytedEz0sG5dG/TtewgtWpTCzp1dYGaWLyvSpVSkubgpU6YMxo8fjz59+iS539vbG7Vq1cqwxIiIiDLDrptvMOPQI6VY+SKGODW+Ef79QHGfPtVgYqKHNm3KQkNDlplpkgrSXNzY2tri7t27yRY3Mpks0Q8BERFRdvUkIAytV3smil+Z0hRFDfUwc+Z5fP0ajXXr2irtb9euXGalSOmU5uJm+fLliI2NTXZ/9erVoVAoMiQpIiIidZqw1xsH771Xim3sWwstKxfFu3dhaNZxGzw9/QEAjRtbwcmpclakSemU5uKmaNGi6syDiIhIrYQQGO/ujcPeH5TiPeuUwJIuVQEAJ048R79+h/DlSzQAQFNThqAgjinNaTiJHxER5Xo3X32B88YbieKXJzdFiUIGiI+XY8aM81i27Jq0r0QJY7i5dUW9epaZmSplABY3RESUawkhYL/0PAJCY5TiWwfURtPyZgAAf/9Q9OixH9evv5P2d+hQHlu3dkTBgvqZmi9lDBY3RESUK737GoUGvykvmzC7XSUMbPBjXpqjR33Rv/9hfP36rfjR1tbA//7XAmPH2kEm49NQORWLGyIiylVi4uWY948P9tzyV4o/nNsShnra0rYQAqtW3ZAKGysrE+zd2w21a1tkar6U8bJ8ruh169bBysoKenp6sLOzw61
bt1Jsv2rVKpQvXx76+vqwtLTE+PHjERMTk2IfIiLKOyrM8lAqbOysC+L1kjZKhQ3wbQqTnTu7oHBhA3TpUhFeXkNZ2OQS6bpyc/nyZRgYGMDW1laK3blzB1FRUWjUqFGaj+Pu7o4JEyZgw4YNsLOzw6pVq+Do6AhfX1+YmZklar97925MnToVW7Zsgb29PZ49e4b+/ftDJpNhxYoV6XkrRESUCwghMMD1Ni76Ki8FdGJMQ1Qy/zGzfmhoDIyN9aRtc3ND3LkzBJaWRrwNlYvIRDpm3tPQ0ECFChXg4+MjxSpWrIhnz55BLpen+Th2dnaoXbs21q5dCwBQKBSwtLTE6NGjMXXq1ETtR40ahSdPnuDcuXNSbOLEibh58yauXLmSptcMCwuDsbExQkNDk11KgoiIco6XnyLQfPmlRHG/pT8m34uJScDkyadx7Nhz3Ls3BAUKcKBwTqPK53e6rty8fv0a2trKl/fOnTuH+Pj4NB8jLi4Od+/exbRp06SYhoYGHBwccP369ST72NvbY+fOnbh16xbq1KmDV69e4cSJE+jbt2+yrxMbG6s0+WBYWFiacyQiouxtyv4HcL/zVim2f1g91CpZQNp+8SIYTk774OUVCAAYOPAoDh504pWaXCxdxU3JkiUTxczNzVU6xufPnyGXy1GkSBGleJEiRfD06dMk+/Tq1QufP39GgwYNIIRAQkIChg0bhunTpyf7OkuWLMG8efNUyo2IiLK3pOatGdW0DCY5lleKubs/wuDB/yA8PA4AoKenhdaty2RanpQ1snxAsSouXryIxYsX488//8S9e/dw8OBBHD9+HAsWLEi2z7Rp0xAaGip9vX37Ntm2RESU/S085pOosLk700GpsImOjsewYcfQo8cBqbApX74Qbt4chCFDavGqTS6Xpis3BQoUSPMPQnBwcOqNAJiamkJTUxNBQUFK8aCgoGSXepg1axb69u2LQYMGAQCqVq2KyMhIDBkyBDNmzICGRuJaTVdXF7q6umnKiYiIsi+FQqDU9BNKsQUdK6NvPSulmK/vZzg57ceDBz8+X/r2rYY//2yL/Pl1MiNVymJpKm5WrVqV4S+so6ODWrVq4dy5c+jUqROAbwOKz507h1GjRiXZJyoqKlEBo6mpCQBckZyIKJf7/bSv0vat6c1hZqSnFNu9+yGGDPkHkZHfxoDq62th3bo26N+/Bq/W5CFpKm5cXFzU8uITJkyAi4sLbG1tUadOHaxatQqRkZEYMGAAAKBfv36wsLDAkiVLAADt27fHihUrYGNjAzs7O7x48QKzZs1C+/btpSKHiIhyl0Hb7uDsE+Wr/K+XtEmyWAkJiZEKm0qVCmPv3m6oXDnx1CKUu6VrQPHLly+xdetWvHz5EqtXr4aZmRlOnjyJEiVKoHLltC8L7+zsjE+fPmH27NkIDAxEjRo14OHhIQ0y9vf3V7pSM3PmTMhkMsycORPv379H4cKF0b59eyxatCg9b4OIiLK5Br+dx7uv0Uqxlc7Vk70KM3y4LS5c8IOhoQ7WrGmNfPl4GyovUnmem0uXLqF169aoX78+Ll++jCdPnqBUqVJYunQp7ty5g/3796sr1wzBeW6IiHKGdms88ej9j+k7tg2sg8blCkvbQgjcvRsAW1vlp3Xj4+XQ1ubV/NxGlc9vlZ+Wmjp1KhYuXIgzZ85AR+dHRdysWTPcuJF4OXkiIqK0iopLwOqzz2E19bhSYXNjWnOlwiYiIg79+h1G7dp/48SJ50rHYGFDKt+WevjwIXbv3p0obmZmhs+fP2dIUkRElLd4PArEmvPP8fhD4olWb0xrjqI/LZnw4EEQnJz2wdf3CwCgX79DePFiDExM9BL1pbxJ5eLGxMQEAQEBsLa2Vop7eXnBwoILjhERkWqsph5PMj6mWRlMaPlj7hohBP7++x7GjDmJ2NhvS/0YGupg7do2LGxIicrFTY8ePTBlyhTs27cPMpkMCoUCV69exaRJk9CvXz915EhERLlQgly
BMjNOKsU61jDHmOZlUbpwfqV4WFgshg49Bje3R1LMxqYo3N27oWzZQpmSL+UcKhc3ixcvxsiRI2FpaQm5XI5KlSpBLpejV69emDlzpjpyJCKiXOR9SDQWn3iC4w8ClOIvF7eBpkbip6C8vALg5LQfL178mCR25Mja+P33ltDTS9dDv5TLpWtVcODbY9qPHj1CREQEbGxsULZs2YzOTS34tBQRUdY5eO8dJuy9nyj+8wrePztwwAe9eh1EXNy321DGxrrYvLkDunatpNY8KftR+6rgAFCiRAlYWloCAGd9JCKiVN1981WpsKle3BjjWpRD0/LJT7JXs2Yx6OtrIS5Ojtq1zeHm1g2lShVItj0RkM6FMzdv3owqVapAT08Penp6qFKlCjZt2pTRuRERUS6x+6Y/uq6/Jm3/r2s1HBnVIMXCBgCsrQtgy5aOGDfODleuDGRhQ2mi8pWb2bNnY8WKFRg9ejTq1asHALh+/TrGjx8Pf39/zJ8/P8OTJCKinCk4Mg5zjj7GP/c/SLHpbSrAqbZlorZCCGzZ4gVn5ypKC1x26VIRXbpUzJR8KXdQecxN4cKF8ccff6Bnz55K8T179mD06NHZfq4bjrkhIsocC4/5YNOV10qx37tXR7daxRO1DQ6OxoABR3D0qC/69auObds6ZVKWlFOodcxNfHw8bG1tE8Vr1aqFhIQEVQ9HRES5zGGv9xjn7p0ofnCEPWqWSHxb6fr1t+jR4wD8/UMBANu338eYMXVQq5Z5orZEaaFycdO3b1+sX78eK1asUIpv3LgRvXv3zrDEiIgoZ3keFI4WKy8niu8bVg+1rQomiisUAsuXX8P06eeRkKAAABQqpI/t2zuzsKH/JE3FzYQJE6TvZTIZNm3ahNOnT6Nu3boAgJs3b8Lf35+T+BER5UEx8XJUnXsK8XLlUQ49altiUeeqSc5d8/lzFFxcDiutC9WgQQns2dMVxYtzyAD9N2kqbry8vJS2a9WqBQB4+fIlAMDU1BSmpqZ4/PhxBqdHRETZ2ZeIWNRaeFYpVqtkARwYbp9sH0/PN+jZ8wDevw8HAMhkwPTpDTF3bhNoaaXrIV4iJWkqbi5cuKDuPIiIKIcJDI1B3SXnlGK+C1tBVyv5Vblv3HiHpk23Qf7/V3kKFzbArl1d0KJFabXmSnkL560mIiKVBEfGwemv63jxMUIpntwswz+rU8cCLVqUhofHCzRtaoVdu7qgWDFDdaVKeVS6ips7d+5g79698Pf3R1xcnNK+gwcPZkhiRESU/aw6+wyrzj5XiqV2G+pnGhoybN/eCVu3emPixHrQ1ORtKMp4Kv9Uubm5wd7eHk+ePMGhQ4cQHx+Px48f4/z58zA2NlZHjkRElA3sueWvVNhUtzTBrRnNky1s5HIF5s+/hEuX/JTihQvnw6+/1mdhQ2qTrlXBV65ciZEjR8LQ0BCrV6+GtbU1hg4dimLFiqkjRyIiykKxCXLUW3IewZE/rtT/M6oBqhZP/g/agIBw9OlzCOfPv4a5uSG8vYeicOF8mZEukepXbl6+fIm2bb/dV9XR0UFkZCRkMhnGjx+PjRs3ZniCRESUNWLi5Wi58hLKz/RQKmzch9RNsbA5c+YlatT4C+fPf5udODAwAhcu+Kk7XSKJylduChQogPDwb4/vWVhY4NGjR6hatSpCQkIQFRWV4QkSEVHWqDDLQ2m7UjEjHBhuD32dpJ+GSkhQYO7ci1i82BPfF/YxNzfEnj1d0ahRSXWnSyRRubhp1KgRzpw5g6pVq6J79+4YO3Yszp8/jzNnzqB58+bqyJGIiDJRXIICTn9dV4p5/toUlgUNku3z7l0YevU6AE9PfynWunUZbNvWibejKNOpXNysXbsWMTExAIAZM2ZAW1sb165dQ9euXTFz5swMT5CIiDLPPf+v6PLnNaXYs4WtoZPC5HonTz5H376H8OVLNABAU1OGxYubY9Ike2gkMTsxkbqpvCp4TsdVwYmIElMoBNquuYInAWF
SzFhfG5cnN4WxgXay/T5/joKV1SpERsYDACwtjeDm1g329pZqz5nylgxfFTwsLCz1Rv+PBQMRUc7yKTwWtRcpL6Ewu10lDGxgnWpfU1MDrF3bBgMGHEGHDuWxdWtHFCyor65UidIkTcWNiYkJZLKULy0KISCTySCXyzMkMSIiUr+YeHmiwubW9OYwM9JLts/33/ff9e9fA0WK5EOrVmVS/awgygxcW4qIKA+KiZcnehrKxEAb92a2SHacTFycHFOnnkVCggJ//NFaaV/r1mXVliuRqtJU3DRu3FjdeRARUSaQKwRG7roHj8eBSnHT/Dq4M7NFsv1ev/6KHj0O4Nat9wCAxo1LomvXSmrNlSi9uHAmEVEecdH3I/pvvZ0o/mJRa2ilsBTCwYNPMHDgEYSGxgIAdHQ08fVrjNryJPqvWNwQEeUBc48+hus1P6XYhj610KpK0WT7xMYmYNKk01i79kdBVLp0Abi7d0OtWubqSpXoP2NxQ0SUy1lNPa60PbNtRQxqWCrFPi9eBMPZeT/u3QuQYs7OlbFxY3sYGemqJU+ijMLihogolxJCwHraCaXYnsF1Ua90oRT7ubs/wuDB/yA8/Nt6Urq6mli9uhWGDKnFp6EoR0hXcZOQkICLFy/i5cuX6NWrFwwNDfHhwwcYGRkhf/78GZ0jERGp6ENINOyXnleK+S5sBV2tpNeF+k6hEFi37rZU2JQrVwh793ZD9erJ374iym5ULm7evHmDVq1awd/fH7GxsWjRogUMDQ3x22+/ITY2Fhs2bFBHnkRElAZ333xF1/XXEsVfL2mTpqsuGhoy7N7dFTVqbEDr1mWxfn1b5M+vo45UidRG5eJm7NixsLW1xf3791Go0I9Lm507d8bgwYMzNDkiIkq72ovO4lN4rFKsQ3Vz/NHTJsV+X79Go0CBH7MKFy9uBG/vYbCwMORtKMqRVC5uPD09ce3aNejoKFfyVlZWeP/+fYYlRkREafPP/Q8YvcdLKdauWjGs7VUzxX5RUfEYM+YkLlzww717Q2Bs/GNW4uLFuZQO5VwqFzcKhSLJJRbevXsHQ0PDDEmKiIhSp1AIVJl7ClFxyr+TfeY7wkAn5V/vPj6f4OS0D48ffwIADBr0D/bu7cYrNZQrJD9rUzJatmyJVatWSdsymQwRERGYM2cO2rRpk5G5ERFRMqLiElBq+gmlwmZGm4rwW9o21cLG1dUbtrYbpcLGwEAbHTqUY2FDuYZMCCFU6fDu3Ts4OjpCCIHnz5/D1tYWz58/h6mpKS5fvgwzMzN15ZohVFkynYgou1EoBJad9sX6iy+V4vdnt4SxgXaKfSMi4jBy5Als335filWtaoa9e7ujQgVTteRLlFFU+fxWubgBvj0K7ubmhgcPHiAiIgI1a9ZE7969oa+f/Ze5Z3FDRDlZnUVn8fGnQcOVihnhxNiGqfZ7+DAITk778fTpZyk2eHBNrF7dCvr6KRdFRNmBKp/fKo+5iYmJgZ6eHvr06ZPuBImISDXxcgXKzjipFFvbywbtqqW+DMKWLV4YOfIEYmISAAD58+tg48Z26NmzqlpyJcpqKo+5MTMzg4uLC86cOQOFQqGOnIiI6Cde/l8TFTbPF7VOU2EDfLsd9b2wqVGjKO7dG8LChnI1lYubbdu2ISoqCh07doSFhQXGjRuHO3fuqCM3IqI879H7UHT+U3lSPp/5jtBOYRXvfxs9ug46d66AkSNr4/r1X1C2bMrLLxDldOkacwMA4eHh2L9/P/bs2YPz58+jVKlS6NOnD2bPnp3ROWYojrkhopxik+crLDz+RNoe3qQ0prSqkGIfIQRu3XoPO7viSvGEBAW0tFT+e5Yo21D7gOJ/8/HxQe/evfHgwYMk58DJTljcEFF2F5sgx6jdXjjjEyTFOttYYKVzjRT7hYbGYNCgf7B/vw88PHrD0bGMmjMlyjxqHVD8XUxMDI4ePYrdu3fDw8MDRYoUweTJk9N7OCIiQuKrNQCwe5Ad7Muk/Kj
2nTsf4OS0D69fhwAA+vY9hJcvx8DQUFddqRJlWyoXN6dOncLu3btx+PBhaGlpoVu3bjh9+jQaNWqkjvyIiPKMJwFhiQqbf0Y1QNXixsn2EULgjz9uYvLkM4iP//aQh4mJHjZubM/ChvIslYubzp07o127dti+fTvatGkDbW3Oj0BElBFar/aUvj82ugGqWCRf1ABAcHA0Bg48giNHfKVY3brF4ebWFSVLmqgrTaJsT+XiJigoiGtIERFloBcfw+Gw4rK03a9eyVQLmxs33sHZeT/8/UOl2KRJ9bB4cXNoa2uqLVeinCBNxU1YWJg0eEcIgbCwsGTbcpAuEVHa/XtFbw0ZMK9D5RT77Nr1AP37H0FCwrfbUIUK6WPbtk5o27acWnMlyinSVNwUKFAAAQEBMDMzg4mJSZKLqwkhIJPJsv3TUkRE2UWrVZfxNDBc2q5bqiB2/mKX6gKWdnbFoa+vhfDwONSvbwk3t24oXpx/WBJ9l6bi5vz58yhYsCAA4MKFC2pNiIgoL7Caelxpe9vAOmhcrnCa+pYpUxCbNnWAt3cg5s9vyvlriP5F5Xlu/P39YWlpmegvCyEE3r59ixIlSmRoghmN89wQUVZSKASG7LiDs08+SrFXi9tAQyPpqzUKhcDGjXfRt2815Munk1lpEmU7ap3nxtraWrpF9bPg4GBYW1vzthQRUTJ+P+WLtRdeKMVeL2mT7G2ojx8j0bfvIZw+/RK3br3Hli0dMyNNohxP5WuZ38fW/FtERAT09PQyJCkiotxmw6WXiQqbgyPsky1sLl70Q40aG3D69EsAgKurNx48CEqyLREpS/OVmwkTJgAAZDIZZs2aBQMDA2mfXC7HzZs3UaNGjQxPkIgop/MNDMfSk0+l7QPD66FWyYJJtpXLFVi0yBPz5l2CQvFt1ECRIvmwa1cXVKtWJFPyJcrp0nzlxsvLC15eXhBC4OHDh9K2l5cXnj59iurVq8PV1VXlBNatWwcrKyvo6enBzs4Ot27dSrF9SEgIRo4ciWLFikFXVxflypXDiRMnVH5dIiJ1+83jKaymHofjqh9z2PzVt1ayhU1gYARattyJOXMuSoVN8+bW8PYehubNS2VKzkS5QZqv3Hx/SmrAgAFYvXp1hgzGdXd3x4QJE7BhwwbY2dlh1apVcHR0hK+vb6IxPQAQFxeHFi1awMzMDPv374eFhQXevHkDExOT/5wLEVFGUSgErrz4jPUXXyrFa1sVgGPlokn2OXv2Ffr0OYigoEgAgIaGDPPmNcG0aQ2gqcmnoYhUkSGrgqeXnZ0dateujbVr1wIAFAoFLC0tMXr0aEydOjVR+w0bNmDZsmV4+vRpupd94NNSRKROu26+wYxDj5RiqT3mfemSH5o23Ybvv43NzQ2xe3cXNG5spcZMiXKWDH9aqkuXLnB1dYWRkRG6dOmSYtuDBw+mKcm4uDjcvXsX06ZNk2IaGhpwcHDA9evXk+xz9OhR1KtXDyNHjsSRI0dQuHBh9OrVC1OmTIGmZtLTjcfGxiI2NlbaTml2ZSKi/+L806BEhc3CTlVSnb+mYcOScHAohTNnXqFVqzLYvr0TChfOp85UiXK1NBU3xsbG0oh+Y+OU1ztJq8+fP0Mul6NIEeUBckWKFMHTp0+T7PPq1SucP38evXv3xokTJ/DixQuMGDEC8fHxmDNnTpJ9lixZgnnz5mVIzkREyRFCYKDrHWl71yA72JculOpsw8C3W1A7dnTGnj2PMGaMXbJz3hBR2mTZbakPHz7AwsIC165dQ7169aT4r7/+ikuXLuHmzZuJ+pQrVw4xMTF4/fq1dKVmxYoVWLZsGQICApJ8naSu3FhaWvK2FBFlmIu+H9F/621pe2ijUpjWpmKSbePj5Zg9+wLati2HBg2y96SnRNmJWifxi46OhhBCehT8zZs3OHToECpVqoSWLVum+TimpqbQ1NREUJDyvA1BQUEoWjTpAXfFihWDtra20i2oihUrIjAwEHFxcdDRSTx
7p66uLnR1ddOcFxGRKj5HxCoVNgAwtXWFJNu+fRuKHj0O4Nq1t9ix4wG8vYfB1NQgybZElH4qD8Hv2LEjtm/fDuDbY9l16tTB8uXL0bFjR6xfvz7Nx9HR0UGtWrVw7tw5KaZQKHDu3DmlKzk/q1+/Pl68eAGFQiHFnj17hmLFiiVZ2BARqcvTwDB0XHcVtgvPSrGB9a3ht7Rtkreijh17hho1/sK1a28BAEFBkbhyxT/T8iXKS1Qubu7du4eGDRsCAPbv34+iRYvizZs32L59O/744w+VjjVhwgT8/fff2LZtG548eYLhw4cjMjISAwYMAAD069dPacDx8OHDERwcjLFjx+LZs2c4fvw4Fi9ejJEjR6r6NoiI0iUmXo7em26g1SpP3H8bIsV1NDUwu32lRO3j4uSYOPEU2rffg+DgaABAyZLGuHJlADp1SvoKDxH9NyrfloqKioKhoSEA4PTp0+jSpQs0NDRQt25dvHnzRqVjOTs749OnT5g9ezYCAwNRo0YNeHh4SIOM/f39oaHxo/6ytLTEqVOnMH78eFSrVg0WFhYYO3YspkyZourbICJS2chd93D8ofL4vvJFDDGiaWl0qG6eqL2fXwicnffj1q33UqxTpwrYsqUDChTQV3u+RHmVygOKq1WrhkGDBqFz586oUqUKPDw8UK9ePdy9exdt27ZFYGCgunLNEJznhohU9SwoHC1XXk4U3zXIDvXLmCbZ59ChJxg48ChCQmIAADo6mvj99xYYNapOmp6gIiJlah1QPHv2bPTq1Qvjx49Hs2bNpPExp0+fho2NTfoyJiLKhl5/jkTT3y8mip8a1wjlixom2y8oKAK9ex9EdHQCAKBUqQLYu7cbatVKfHWHiDJeuh4FDwwMREBAAKpXry7dNrp16xaMjIxQoUL2vofMKzdElBavPkWg2fJLSrGedSyxuHPVNF152bz5HgYN+gfdu1fC33+3h7GxnrpSJcoTVPn8/k/z3Lx79w4AULx48fQeItOxuCGilCgUAn97vsKSn1bxLlHQAEdH1YeJQfJPZSoUQmnyPSEEzp59BQeHUrwNRZQBVPn8VvlpKYVCgfnz58PY2BglS5ZEyZIlYWJiggULFig9ok1ElJMc9nqPzn9eRanpJ5QKm7bViuHyr02TLWxiYhIwYsRxTJhwSikuk8nQokVpFjZEWUDlMTczZszA5s2bsXTpUtSvXx8AcOXKFcydOxcxMTFYtGhRhidJRKQu4THxqDr3dJL7FnSsjL71rJLt++zZFzg57cP9+98mI23SxIqPdxNlAyoXN9u2bcOmTZvQoUMHKfb9sewRI0awuCGiHKX+0vNK2yOalEYnGwuUK5L8gGEA2L37IYYOPYaIiDgAgL6+lvQ9EWUtlYub4ODgJAcNV6hQAcHBwRmSFBGRugkhMHHvfYTFfHuiybKgPjx/bZZqv6ioeIwdexKbNnlJsYoVTbF3b3dUqWKmtnyJKO1UHnNTvXp1rF27NlF87dq1qF69eoYkRUSkTqcfB8J62gkc9Poxud75iU1S7ffkySfY2W1SKmz696+B27cHs7AhykZUvnLzv//9D23btsXZs2elOW6uX7+Ot2/f4sSJExmeIBFRRvENDIfjqsST8bkNqQttzZT/1tu2zRsjRpxAVFQ8AMDAQBvr17dFv378o44ou1G5uGncuDGePXuGP//8E0+ePAEAdOnSBSNGjIC5OSeoIqLsqc+mm7jy4rNSbFLLchjZtEyqTzTJ5Qps3HhPKmyqVDHD3r3dULFiYbXlS0Tpp1Jx4+fnhzNnziAuLg49evRAlSpV1JUXEVGG2HfnLSbvf6AUa1O1KNb1qpnmx7Q1NTWwZ09X2Nj8hS5dKmD16tYwMNBWR7pElAHSXNxcuHAB7dq1Q3T0t1VttbS0sGXLFvTp00dtyRERpdej96Fot+ZKovj1ac1QzDjlRSuFEAgOjkahQgZSrEQJYzx6NBzFiqX8FBURZb00DyieNWsWWrRogff
v3+PLly8YPHgwfv31V3XmRkSULscfBCQqbGa3q4TXS9qkWtiEh8eid++DqFt3M8LCYpX2sbAhyhnSvPyCiYkJrl27hkqVKgEAoqKiYGRkhKCgIBQqVEitSWYkLr9AlLutOfccy888k7YHNbDGzHaV0tTX2zsQTk778Pz5t2ktevSogj17uqolTyJSjVpWBQ8LC4Opqam0bWBgAH19fYSGhuao4oaIcqfQqHiMdvPC5WefpNi+YfVQ26pgqn2FENiw4Q7Gjz+F2Fg5AMDISBddunC2YaKcSKUBxadOnYKxsbG0rVAocO7cOTx69EiK/TxzMRFRZph28AH23HqrFNvYt1aaCpvQ0BgMHvwP9u3zkWK1ahWDu3s3lC6den8iyn7SfFtKQyP14TkymQxyufw/J6VOvC1FlLsEhcXAbvE5adtARxMeYxuhxE+DgZNz584HODvvx6tXX6XYmDF18L//tYCursozZRCRGqnlthRX/Cai7EauEEqFzcO5LWGol7ZHtP/88zbGjfNAfPy3320mJnrYurUjF74kygX4pwkR5UgRsQmoMueUtJ1PRzPNhQ0AxMYmSIWNnZ0F3Ny6wcrKJKPTJKIskKZHwW/cuJHmA0ZFReHx48fpToiIKDUP3oUoFTYA8Gieo0rHGDeuLjp2LI+JE+vh8uUBLGyIcpE0FTd9+/aFo6Mj9u3bh8jIyCTb+Pj4YPr06ShdujTu3r2boUkSEX33z/0P6LD2qrRtZ10QfkvbpjjbsEIhcPWqv1JMJpPhwAEn/P57S+joaKotXyLKfGkqbnx8fNC2bVvMnDkTJiYmqFy5Mlq0aIH27dujQYMGMDU1Rc2aNfH69WucPn0a/fr1U3feRJQHXXn+GaP3/FiR26VeSbgPrZdiny9fotChwx40bLgVZ8++UtqnmcpimUSUM6X5aanv7ty5gytXruDNmzeIjo6GqakpbGxs0LRpUxQsmP0fm+TTUkQ5T3BkHGouOKMU2+xii+YVi6TY7+pVf/TocQDv3oUBAIoWzY+XL8dwXSiiHEgtT0t9Z2trC1tb23QnR0SkipefItB8+SWl2KAG1ikWNgqFwP/+dxUzZ56HXP7t7zdTUwO4unZkYUOUB/BpKSLKtp4FhaPlysvSdslCBjg3oTG0Urid9PFjJPr1O4RTp15KscaNS2L37q4wN+faUER5AYsbIsqWQqPilQqb7rWKY1n36in2uXTJDz17HkBAQAQAQCYDZs1qhFmzGkNLi+NriPIKFjdElO14+X9F5z+vSdu97UpgUeeqKfbZvPkehgw5BoXi222oIkXyYdeuLmjevJRacyWi7IfFDRFlK3OPPobrNT9pu1yR/KkWNgDQsGFJGBhoIyIiDs2bW2Pnzi4oWjS/GjMlouzqPxU3MTEx0NPTy6hciCiPqz7vNEKj46XtUU3LYJJj+TT1LVeuEDZubIcXL4IxfXpDPuZNlIep/K9foVBgwYIFsLCwQP78+fHq1bd5I2bNmoXNmzdneIJElDfsu/NWqbC5NrVZsoWNXK7AH3/cRPRP7QGgZ8+qmDWrMQsbojxO5d8ACxcuhKurK/73v/9BR0dHilepUgWbNm3K0OSIKG8Ii4nH5P0PpO0n81vB3EQ/ybYfPoSjefPtGDvWA2PHemRWikSUg6hc3Gzfvh0bN25E7969oan5Y8ry6tWr4+nTpxmaHBHlfs+DwlFt7mlpe0GnKtBPZjkED48XqF59Ay5degMA2LLFC0+ffs6UPIko51C5uHn//j3KlCmTKK5QKBAfH59EDyKi5LX46XHvmiVM0LduyURtEhIUmDbtLFq33oXPn6MAAMWLG+HSpf6oUME003IlopxB5QHFlSpVgqenJ0qWVP4FtH//ftjY2GRYYkSUe8UmyHHyYSDGuXtLsV52JbA4iaei3r4NRc+eB3D16lsp1q5dObi6dkShQgaZkS4R5TAqFzezZ8+Gi4sL3r9/D4VCgYMHD8LX1xfbt2/HsWPH1JEjEeU
iLltu4dKzT4niSRU2x449g4vLYQQHRwMAtLQ0sHRpc0yYUC/FVcCJKG9TeeFMAPD09MT8+fNx//59REREoGbNmpg9ezZatmypjhwzFBfOJMo6VlOPK20XNtRFf3srjGya+Fb3mTMv0bLlTmm7ZEljuLl1Q926xdWeJxFlP6p8fqeruMnJWNwQZb4EuQJlZpxUip0e3wjliiS/1pNcrkDLljtx/vxrdOpUAVu2dECBAkk/QUVEuZ8qn98qDyguVaoUvnz5kigeEhKCUqU4zTkRfaNQCFx98RnrLrxIVNi8XNwmxcIGADQ1NbBrVxf8+WcbHDzoxMKGiNJM5TE3fn5+kMvlieKxsbF4//59hiRFRDnbx7AY1Fl8Lsl9zxa2hqaG8niZ2NgETJlyFk5OlWFvbynFixbNj+HDa6s1VyLKfdJc3Bw9elT6/tSpUzA2Npa25XI5zp07BysrqwxNjohyloXHfLDl6mso/nWzu7ZVAZQtYpjkoOGXL4Ph7Lwfd+8G4NChp/DyGoqCBXmVhojSL83FTadOnQAAMpkMLi4uSvu0tbVhZWWF5cuXZ2hyRJQzPAsKR8uf5qv5rmWlIvirb61kn2zat+8xBg36B2FhsQCAoKAI3Lz5Dq1bl1VrvkSUu6W5uFEoFAAAa2tr3L59G6amnDiLKK8TQmDk7ns48TBQKb6sWzV0srGAdjJrPMXEJGDChFNYv/6OFCtbtiD27u2OGjWKqjVnIsr9VB5z8/r1a3XkQUQ5TL8tt3D5X/PVOFQ0wyaXlMfIPHv2BU5O+3D/fpAU69WrKjZsaAtDQ1215EpEeYvKxQ0AREZG4tKlS/D390dcXJzSvjFjxmRIYkSUPcXEy1FhVuIFKy9NboKShfKl2Hf37ocYOvQYIiK+/d7Q09PCmjWt8csvNpyUj4gyjMrFjZeXF9q0aYOoqChERkaiYMGC+Pz5MwwMDGBmZsbihigXi4hNQJU5p5Ri92e3hLGBdqp9370Lw8CBRxAb++1pywoVTLF3bzdUrVpELbkSUd6l8jw348ePR/v27fH161fo6+vjxo0bePPmDWrVqoXff/9dHTkSURYTQmDRcR+lwsZYXxsvFrVOU2EDfFvocvXqVgAAF5fquHNnMAsbIlILlWcoNjExwc2bN1G+fHmYmJjg+vXrqFixIm7evAkXFxc8ffpUXblmCM5QTJR2jz+EYpybN55/jFCKO1Qsgk0utqn2VygENH6a00YIgYsX/dC0qXWG50pEuZsqn98q35bS1taGhsa3Cz5mZmbw9/dHxYoVYWxsjLdv36bSm4hygp033mDm4UdJ7ktt2QQAiIyMw4gRJ2Bqqo/lyx2luEwmY2FDRGqncnFjY2OD27dvo2zZsmjcuDFmz56Nz58/Y8eOHahSpYo6ciSiTCKEwKjdXjj+MEAp7mRbHBNalEdRY71Uj/Ho0Ud0774PT59+BgA0aWKF9u3LqyVfIqKkqFzcLF68GOHh4QCARYsWoV+/fhg+fDjKli2LzZs3Z3iCRJQ59tzyx7SDD5Vim/rZwqFS2sbFCCGwebMXRo8+iZiYBABAvnza0vdERJmFq4ITESJjE1D5X09B/TOqAaoWN06mh7Lw8FgMG3Ycu3f/KI6qVy+CvXu7o1y5QhmaKxHlTWpdFTw59+7dQ7t27TLqcESUSVaceaZU2PzdzxZ+S9umubDx9g6Ere3fSoXNsGG1cOPGIBY2RJQlVLotderUKZw5cwY6OjoYNGgQSpUqhadPn2Lq1Kn4559/4OjomPpBiChbuPbiM3ptuqkUa1O1KFqocBtqw4Y7GD/+lDR3jaGhDjZt6gAnp8oZni8RUVqlubjZvHkzBg8ejIIFC+Lr16/YtGkTVqxYgdGjR8PZ2RmPHj1CxYoV1ZkrEWUAIQRarfKEb1C4Uvz37tXRtaZFmo+TkKDAtm33pcKmVq1icHfvhtKlC2ZovkREqkrzmJtq1aqhb9++mDx5Mg4cOIDu3bujbt262Lt3L4o
XL67uPDMMx9xQXnbz1Rc4b7yhFGtbtRjW9a6ZruP5+YXAxuYv9O1bDcuWtYCubrpWdCEiSpUqn99pLm7y5cuHx48fw8rKCkII6Orq4sKFC6hfv36GJJ1ZWNxQXrXvzltM3v9AKXZrenOYGaX+eDfw7YrPp09RMDNTXj8qKCgCRYrkz7A8iYiSopYBxdHR0TAwMADwbSIuXV1dFCtW7L9l+v/WrVsHKysr6Onpwc7ODrdu3UpTPzc3N8hkMnTq1ClD8iDKrS74flQqbMY2Lwu/pW3TXNh8/RqNrl33omHDrQgPj1Xax8KGiLIbla4hb9q0Cfnzf/tFlpCQAFdXV5iamiq1UXXhTHd3d0yYMAEbNmyAnZ0dVq1aBUdHR/j6+sLMzCzZfn5+fpg0aRIaNmyo0usR5RUvPkZg0r778H4bohRXZe4aALh58x2cnffjzZtQAMCIESewY0fnjEyViChDpfm2lJWVFWQyWYptZDIZXr16pVICdnZ2qF27NtauXQsAUCgUsLS0xOjRozF16tQk+8jlcjRq1AgDBw6Ep6cnQkJCcPjw4TS9Hm9LUV4w6/Aj7LjxJlH8z9410aZq2q64CiGwYsV1TJ16DgkJCgBAgQJ62LatE2ccJqJMp5a1pfz8/P5rXonExcXh7t27mDZtmhTT0NCAg4MDrl+/nmy/+fPnw8zMDL/88gs8PT0zPC+inOxjeIxSYVO6cD7MaV8ZDcuapvoHyndfvkShf/8jOHbsmRSzt7fEnj1dUaJE2ua/ISLKKln6aMPnz58hl8tRpIjyJfIiRYoku7r4lStXsHnzZnh7e6fpNWJjYxEb+2OMQFhYWLrzJcruhBCos+ictH1kZH1UtzRR6RjXrr1Fjx778fbtj38rU6bUx4IFTaGtrZlRqRIRqU2GzVCcGcLDw9G3b1/8/ffficb6JGfJkiUwNjaWviwtLdWcJVHWWHjMB9bTTkjbNSxNVC5sli+/hkaNtkqFjampAU6e7I2lSx1Y2BBRjpGlV25MTU2hqamJoKAgpXhQUBCKFi2aqP3Lly/h5+eH9u3bSzGF4ttYAC0tLfj6+qJ06dJKfaZNm4YJEyZI22FhYSxwKFeJS1Cg3MyTieKHR6o+TYNCISCXfxuG16hRSeze3QUWFhybRkQ5S5YWNzo6OqhVqxbOnTsnPc6tUChw7tw5jBo1KlH7ChUq4OFD5VWLZ86cifDwcKxevTrJokVXVxe6urpqyZ8oqx24+w4T991XirkNqYu6pdK3ptPEifbw9PRH9epFMGdOE2hp5aiLu0REALK4uAGACRMmwMXFBba2tqhTpw5WrVqFyMhIDBgwAADQr18/WFhYYMmSJdDT00OVKlWU+puYmABAojhRbhYTL0eFWR6J4i8Xt4GmRtoGDcvlCly9+haNGpWUYhoaMhw+3AMaaTwGEVF2lK7i5uXLl9i6dStevnyJ1atXw8zMDCdPnkSJEiVQubJqC+Y5Ozvj06dPmD17NgIDA1GjRg14eHhIg4z9/f2hocG/Hom+834bgk7rrirF2lYthnkdK6e5sAkMjECfPgdx/vxrnD3bD82aWUv7WNgQUU6X5nluvrt06RJat26N+vXr4/Lly3jy5AlKlSqFpUuX4s6dO9i/f7+6cs0QnOeGcrL3IdGov/S8tG2gowmf+a1UOsa5c6/Qu/dBBAVFAgAsLAzx4sUY6Oll+YVcIqJkqWX5he+mTp2KhQsX4syZM9DR0ZHizZo1w40bN1LoSUTpJVcINPzfeaXC5tdW5VUqbORyBWbPvoAWLXZIhU2xYvmxc2cXFjZElKuo/Bvt4cOH2L17d6K4mZkZPn/+nCFJEdEPQWExsFt8Tik2uKE1RjQpk+ZjfPgQjl69DuDSpR+T+7VsWRo7dnROtBAmEVFOp3JxY2JigoCAAFhbWyvFvby8YGFhkWGJEdG3SfnarFaehfvRPEfk1037P91Tp16gT59D+Pw5CgCgqSnDggVNMWVKA46vIaJcSeX
bUj169MCUKVMQGBgImUwGhUKBq1evYtKkSejXr586ciTKsxovu4gvkXHStt/StioVNn/+eRutWu2SChsLC0NcvNgf06Y1ZGFDRLmWysXN4sWLUaFCBVhaWiIiIgKVKlVCo0aNYG9vj5kzZ6ojR6I8qf/WW/APjpK2PX9tqvIxmjWzRr582gCAtm3Lwtt7GBo0KJFhORIRZUcqPy31nb+/Px49eoSIiAjY2NigbNmyGZ2bWvBpKcoJVp55htXnnkvbPvMdYaCTvkG/u3Y9QEBABCZMqMerNUSUY6ny+a1ycXPlyhU0aNDgPyWYlVjcUHYmhFBaHwoAbkxrjqLGeqn2jY+XY/Xqmxg5sjb09bXVlSIRUZZQ66PgzZo1g7W1NaZPnw4fH590J0lEyhLkikSFza5BdmkqbPz8QtCw4VZMnnwGEyeeVleKREQ5gsrFzYcPHzBx4kRcunQJVapUQY0aNbBs2TK8e/dOHfkR5QkJcgXKzFBe/PLl4jaoX8Y01b6HDz+Fjc1fuHnzPQBg06Z7ePkyWC15EhHlBCoXN6amphg1ahSuXr2Kly9fonv37ti2bRusrKzQrFkzdeRIlKsJIRIVNl6zWqS6lEJsbALGjfNA587uCAmJAQBYW5vg6tWBKF26oNryJSLK7tI9oPg7uVyOkydPYtasWXjw4AHkcnlG5aYWHHND2Y3N/NP4GhUvbd+Z6QDT/CmvZP/yZTCcnffj7t0AKdatWyVs2tQexmm4jUVElNOodczNd1evXsWIESNQrFgx9OrVC1WqVMHx48fTeziiPOeC70dYTT2uVNj4LW2bamGzb99j1Ky5USpsdHQ0sW5dG+zd242FDRER0jFD8bRp0+Dm5oYPHz6gRYsWWL16NTp27AgDAwN15EeU6yTIFagwywMJCuWLpt6zW6Ta99ixZ3By+rE4bZkyBbF3bzfY2BTL8DyJiHIqlYuby5cvY/LkyXBycoKpaeqDHYnoB4Ui8fia/vZWmNm2IrQ0U7+Q2rp1GTRuXBKXLr1Bz55V8Ndf7WBomPKVHiKivOY/j7nJaTjmhrKS1VTlW7cP57aEoZ5qc9J8+BAOD48XGDCgBmQyTspHRHmDKp/fabpyc/ToUbRu3Rra2to4evRoim07dOiQ9kyJ8gAhBB6+D8XQHXeV4s8WtoaOVvJXa6Ki4jFhwikMHGiDOnV+LEprbm6IgQNt1JYvEVFOl6YrNxoaGggMDISZmRk0NJL/ZSyTyfi0FNFP1px7juVnniWKv1rcJsWlEJ48+QQnp/149OgjrKxM4OU1FCYmHCxMRHlXhl+5USgUSX5PREnzeBSIYTvvJorXKlkAbkPqpljYbNvmjREjTiDq/5+i+vgxEvfuBaBZM2u15UtElJuo/Cj49u3bERsbmygeFxeH7du3Z0hSRDnZ1quvExU2iztXhd/Stjgw3B7ayQwcjoyMQ//+h9G//xGpsKlcuTBu3x7MwoaISAUqDyjW1NREQEAAzMzMlOJfvnyBmZkZb0tRntZn001cefFZ2h7bvCzGtyiXar9Hjz7CyWkfnjz50XfgwBpYs6YNDAy4CCYRUYbflvqZECLJJzTevXsHY2NjVQ9HlGt4PApQKmx2/mKHBmVTni5BCIEtW7wwatRJxMQkAADy5dPGhg3t0KdPNbXmS0SUW6W5uLGxsYFMJoNMJkPz5s2hpfWjq1wux+vXr9GqVSu1JEmU3X2JiMWwnfekba9ZLVAgn06q/d68CcXIkScQG/vtime1akWwd283lC/POaSIiNIrzcVNp06dAADe3t5wdHRE/vz5pX06OjqwsrJC165dMzxBouzs5qsvcN54Qyk2rXWFNBU2AGBlZYIVKxwxcuQJDB1aCytXOkJfn7ehiIj+C5XH3Gzbtg3Ozs7Q08uZj6VyzA1lhNgEOdqvuYJnQRFKcYeKRbDJxTbZfkIIKBQCmj8NKhZC4MoVfzRsWFJt+RIR5XSqfH5zhmIiFYVGx6P6vNNKsQH1rTC
jTcpLKISGxmDIkGOwsjLGb7+lvo4UERH9kOEDigsWLIhnz57B1NQUBQoUSHHK9+DgYNWyJcohImITUGXOqUTxOzMdUl3J++7dD3B23o+XL78CABo3tkKbNmXVkicRUV6XpuJm5cqVMDQ0lL7nejaU11zw/YgBW28rxapaGOOf0Q1S7CeEwNq1tzBp0hnExX0bNGxioge5nJNhEhGpC29LEaVirJsXjnh/kLaLGunh2tRmKc4yDABfv0bjl1+O4tChp1KsTh0LuLt3g5WVibrSJSLKlVT5/FZ5huJ79+7h4cOH0vaRI0fQqVMnTJ8+HXFxcapnS5RNRcfJYTX1uFJhM7C+NW5Mb55qYXPr1nvUrLlRqbCZMKEuPD0HsLAhIlIzlYuboUOH4tmzbwsBvnr1Cs7OzjAwMMC+ffvw66+/ZniCRFlBCIGKsz2UYkdH1cfs9pVS7bdixXXUr78Ffn4hAIACBfRw9GgPLF/uCB0dTXWlTERE/0/l4ubZs2eoUaMGAGDfvn1o3Lgxdu/eDVdXVxw4cCCj8yPKdHEJClhPOyFta2rI8HpJG1QrbpJq3/h4BdzcHiEh4duYGnt7S3h7D0P79uXVlS4REf2LysXNt3k6vv3iPnv2LNq0aQMAsLS0xOfPn1PqSpSthUbFY86RRyg386RS3Ge+Y5oH0evoaMLNrRtMTPQwZUp9XLzoghIluCwJEVFmUnltKVtbWyxcuBAODg64dOkS1q9fDwB4/fo1ihQpkuEJEqmbEAIzDz/Crpv+ifY9XdAKulrJ30pSKAQ+fYpEkSI/ZuwuVaoAnj8fDVNTA7XkS0REKVO5uFm1ahV69+6Nw4cPY8aMGShTpgwAYP/+/bC3t8/wBInUaf/dd5i0736i+IQW5TCmecrz0Hz6FIl+/Q7Dzy8Et28PRv78P5ZcYGFDRJR1MuxR8JiYGGhqakJbO3uvi8NHwem79yHRqL/0vFJsXa+aaFutWKp9L19+g549D+DDh3AAgItLdbi6dlJHmkREBDXMUJyUu3fv4smTJwCASpUqoWbNmuk9FFGmW332OVaefSZtp7WokcsVWLLkCubMuQiF4tvfBWZm+dCnTzW15UpERKpRubj5+PEjnJ2dcenSJZiYmAAAQkJC0LRpU7i5uaFw4cIZnSNRhpErBLpvuIZ7/iFSzKFikTQVNkFBEejd+yDOnXstxZo1s8bOnZ1RrJihOtIlIqJ0UPlpqdGjRyMiIgKPHz9GcHAwgoOD8ejRI4SFhWHMmDHqyJEoQ3yNjEPp6SeUCpv9w+qluIr3d+fOvUL16hukwkZDQ4Z585rg9Ok+LGyIiLIZlcfcGBsb4+zZs6hdu7ZS/NatW2jZsiVCQkIyMr8MxzE3eVN4TDyqzlVeyfvcxMYoXTh/Mj1+WLDgEubMuYjv/1KKFcuP3bu7okkTKzVkSkRESVHrmBuFQpHkoGFtbW1p/hui7GTHdT/MOvJYKfZ6SZs0z12jra0pFTYtW5bGjh2dYWaWL6PTJCKiDKLylZuOHTsiJCQEe/bsgbm5OQDg/fv36N27NwoUKIBDhw6pJdGMwis3eUdkbAIqzzmlFHOyLY7/dauu0nEUCoEOHfbA3t4SU6c2SHVdKSIiyniqfH6rXNy8ffsWHTp0wOPHj2FpaSnFqlSpgqNHj6J48eLpzzwTsLjJ/RQKgTXnXyg9DQUAm/rZwqFSyhNNJiQo4On5Bk2bWic6JosaIqKso9bbUpaWlrh37x7OnTsnPQpesWJFODg4pC9bogy09vxz/H76WaJ4Wm5DvXsXhp49D+Datbc4f74fGje2kvaxsCEiyjlUKm7c3d1x9OhRxMXFoXnz5hg9erS68iJSmd3iswgKi1WKLehUBX3sSqRa2Bw//gwuLofx5Us0AMDF5TCePRvNVbyJiHKgNBc369evx8iRI1G2bFno6+vj4MGDePnyJZYtW6bO/IjSZM2550qFzeoeNdChunmqRU18vBzTp5/D779fl2IlShj
Dza0bCxsiohwqzWNuKleuDCcnJ8yZMwcAsHPnTgwdOhSRkZFqTTCjccxN7jPr8CPsuPFG2n62sDV0tFKfwunNmxD06HEAN268k2IdO5bHli0dUbCgvlpyJSKi9FHLgGJ9fX08efIEVlZWAL49Eq6vrw8/Pz8UK5b67K7ZBYub3CM0Kh7V5yvPXXNohD1sShRIte/hw08xYMARhITEAAC0tTWwbFkLjBljl+ZHxImIKPOoZUBxbGws8uX7MbeHhoYGdHR0EB0dnf5MidIhODIONRecSRQ/M74RyhZJfbbgFSuuY+LEH0WRtbUJ3N27oXZtiwzNk4iIsoZKA4pnzZoFAwMDaTsuLg6LFi2CsbGxFFuxYkXGZUf0L1FxCYkKmwIG2vCc0gz5ddP249yqVRnMnHke0dEJ6Nq1IjZt6gATEz11pEtERFkgzcVNo0aN4OvrqxSzt7fHq1evpG1ezid18ngUgGE77ynFVJlp+LtKlQpjw4Z2CA+PxYgRtflzS0SUy6g8iV9OxzE3OdOxBx8wareXUsxvadtU+8XEJGDVqhsYP74udNN4ZYeIiLIftU7iR5TZ7vgFKxU2W/vXRtMKZqn2e/78C5yd98PLKxDv34dhzZo26kyTiIiyidSflyXKIgqFQMuVl9Btw485aOZ1qJymwmbPnoeoWXMjvLwCAQCbNnnB3z9UbbkSEVH2wSs3lC0Fhsag7pJzSrHxDuXgYm+VYr/o6HiMHeuBv//+MTanfPlC2Lu3O0qUME6hJxER5RYsbijbufU6GE5/XVeK3Z3pgEL5dVPs9/TpZzg57cPDhx+lWN++1fDnn22RP7+OWnIlIqLsh8UNZSsbLr3E0pNPpe0KRQ3hMa5Rqv22b7+P4cOPIyoqHgCgr6+FP/9si/79a6grVSIiyqbSNebG09MTffr0Qb169fD+/XsAwI4dO3DlypV0JbFu3TpYWVlBT08PdnZ2uHXrVrJt//77bzRs2BAFChRAgQIF4ODgkGJ7yll+Lmz+6GmTpsLmwAEfuLgclgqbypUL486dISxsiIjyKJWLmwMHDsDR0RH6+vrw8vJCbOy3xQpDQ0OxePFilRNwd3fHhAkTMGfOHNy7dw/Vq1eHo6MjPn78mGT7ixcvomfPnrhw4QKuX78OS0tLtGzZUiqyKGe69vIzrKYel7Z3/FIHHaqbp6lvx44V0KBBCQDAL7/Y4NatwahUqbBa8iQiouxP5XlubGxsMH78ePTr1w+Ghoa4f/8+SpUqBS8vL7Ru3RqBgYEqJWBnZ4fatWtj7dq1AL6tWWVpaYnRo0dj6tSpqfaXy+UoUKAA1q5di379+qXanvPcZC9yhUDjZRfw7qvyMh5pmcPmZ+/ehcHT8w169qyakekREVE2ocrnt8pXbnx9fdGoUeJbBcbGxggJCVHpWHFxcbh79y4cHBx+JKShAQcHB1y/fj2Fnj9ERUUhPj4eBQsWVOm1KeslyBWwXXhGqbBZ09MmxcImIiIOAwcewZ07H5TixYsbsbAhIiIA6RhQXLRoUbx48UJaHfy7K1euoFSpUiod6/Pnz5DL5ShSpIhSvEiRInj69GkyvZRNmTIF5ubmSgXSz2JjY6VbZ8C3yo+yXv2l5/E+RPlqzb1ZLVAwX/JPNd2/Hwgnp/149uwLLl16g3v3hsDYmGtCERGRMpWv3AwePBhjx47FzZs3IZPJ8OHDB+zatQuTJk3C8OHD1ZFjspYuXQo3NzccOnQIenpJf8gtWbIExsbG0pelpWWm5kiJOa68nKiwebqgVbKFjRACf/11B3Z2m/Ds2RcAwKdPkXjwIEjtuRIRUc6j8pWbqVOnQqFQoHnz5oiKikKjRo2gq6uLSZMmYfTo0Sody9TUFJqamggKUv6QCgoKQtGiRVPs+/vvv2Pp0qU4e/YsqlWrlmy7adOmYcKECdJ2WFgYC5wsIoSAzYIzCPn/p5oAwGtWCxRI4WpNWFgshgz5B+7uj6VYzZrF4O7eDWXK8FYkERE
llu6FM+Pi4vDixQtERESgUqVKyJ8/f7oSsLOzQ506dbBmzRoA3wYUlyhRAqNGjUp2QPH//vc/LFq0CKdOnULdunVVej0OKM46Pz8NBQA3pzdHEaPkbyvduxcAJ6d9ePnyqxQbPboOli1rwUUwiYjymExZOFNHRweVKlVKb3fJhAkT4OLiAltbW9SpUwerVq1CZGQkBgwYAADo168fLCwssGTJEgDAb7/9htmzZ2P37t2wsrKSns7Knz9/ugssUr++m28qbT9d0Ap62ppJthVCYN2625g48TTi4uQAAGNjXWzZ0hFdulRUe65ERJSzqVzcNG3aFDKZLNn958+fV+l4zs7O+PTpE2bPno3AwEDUqFEDHh4e0iBjf39/aGj8GBq0fv16xMXFoVu3bkrHmTNnDubOnavSa1PmcL36Gp7PP0vbr5e0SfFn6MWLYEyYcArx8QoAQO3a5nB37wZr6wJqz5WIiHI+lW9LjR8/Xmk7Pj4e3t7eePToEVxcXLB69eoMTTCj8bZU5uq07iq834ZI21emNEXxAgap9vvjj5sYO9YD48fXxdKlDtDRSfoqDxER5Q1qvS21cuXKJONz585FRESEqoejXCqpxS//7F0zycJGCAGFQkBT88cVutGj66BOHQvUrVtc7bkSEVHuku4Bxf/24sUL1KlTB8HBwRlxOLXhlRv1e/UpAs2WX1KK3ZrRHGaGiQcPBwdHY8CAI6hcuTAWL26eWSkSEVEOkykDiv/t+vXryc41Q3lDXIICMw49xL6776RY43KFsbFfLehqJb6tdO3aW/TosR9v34bh6FFfNG5cEo6OZTIzZSIiyoVULm66dOmitC2EQEBAAO7cuYNZs2ZlWGKUs+y/+w6T9t1XirnUK4l5HaskaqtQCPz++zVMn34Ocvm3C4eFCumnOMiYiIgorVQuboyNjZW2NTQ0UL58ecyfPx8tW7bMsMQo5/ANDE9U2Gx2sUXzikUStf30KRIuLodx8uQLKdawYQns3t0VxYvzNiEREf13KhU3crkcAwYMQNWqVVGgAB/LzetiE+To8uc1PP7wY72uHb/UQcOyhZNs7+n5Bj16HMCHD+EAAJkMmD69IebObQItLZVXAiEiIkqSSsWNpqYmWrZsiSdPnrC4yePCY+JRde5ppdjQxqWSLGwUCoElSzwxe/ZFKBTfbkOZmeXDzp2d0aJF6UzJl4iI8g6Vb0tVqVIFr169grW1tTryoWxOCIHFJ57gb8/XSvGU5q+Jj5fj4MGnUmHTtKkVdu3qgmLFDNWeLxER5T0q3wtYuHAhJk2ahGPHjiEgIABhYWFKX5R7fQqPhfW0E0qFTRULI/gtbZvixHy6ulpwd+8GExM9zJ3bGGfO9GVhQ0REapPmeW7mz5+PiRMnwtDwx4fSz0+3CCEgk8kgl8szPssMxHlu0icyNgGV55xSiv3R0wYdqpsnaiuXK/DxY2SiAiY4OBoFC+qrNU8iIsqdVPn8TnNxo6mpiYCAADx58iTFdo0bN057plmAxY3qHr0PRbs1V6TtVpWLYkPfWkm2DQgIR+/eBxEUFIlbtwYhXz6dzEqTiIhyMbVM4ve9BsruxQtlHO+3Iei07qpSzFBPK9nC5syZl+jT5xA+fowEAIwd64FNmzqoPU8iIqKfqTTmhpOs5S3/LmxGNi2Nh3MdE7VLSFBg5szzcHTcKRU2FhaGcHGpnil5EhER/Uylp6XKlSuXaoGT3deWotS9D4lG/aXnpe0RTUpjsmP5JP/fv3sXhl69DsDT01+KtW5dBtu3d4apaeqrfxMREWU0lYqbefPmJZqhmHKPuAQF2vzhiRcflVd3/7VVhSTbnzjxHP36HcKXL9EAAE1NGZYsaY6JE+2hocGrfERElDVUKm569OgBMzMzdeVCWcj/SxQaLbugFDPQ0cS9WS2SbD99+jksWfJjkHGJEsZwc+uKevUs1ZonERFRatJc3HC8Te4VEhWnVNhoashwd6YDTAySf9IpXz5t6fs
OHcpj69aOfMybiIiyBZWflqLcp8b8M9L3pQrnw/mJTVLtM21aQ1y//g4ODqUwdqwdi18iIso20lzcKBQKdeZBWSQ0Ol763r50IeweXDdRm7g4OTw936B581JSTENDhn/+6cmihoiIsh0uxZyHCSFQfd6PxS93DbJL1Ob1669o0GALHB134soVf6V9LGyIiCg7YnGTR4VExcF62gml2L+LlYMHn8DG5i/cvv0BcrlA//6HkZDAK3hERJS9qbwqOOV80XFypXE2APB6SRvp+5iYBEyefBpr196WYmXKFMTevd2gpcV6mIiIsjcWN3lQ1/XXpO9tSxbAvmH1pKs2L14Ew8lpH7y8AqU2PXpUwV9/tYORkW6m50pERKQqFjd5yOeIWNguPKsU2z/cXvre3f0RBg/+B+HhcQAAXV1N/PFHawweXJPja4iIKMdgcZNH7L3zFr/uf6AUuz6tmfT94sWemDHjx5IL5csXwt693VGtWpFMy5GIiCgjsLjJxWLi5XD66zoevAtVihc21MX1qc2gpflj/EyHDuWxcOFlREcnoE+fali/vi3y509+Ej8iIqLsisVNLhWbIEeFWR6J4rsH2cG+jGmieJUqZli/vi3kcoEBA2rwNhQREeVYLG5yqfIzlQub3YPsUMe6ILQ0NRAZGYeVK2/g11/rQ0dHU2rj4lIjk7MkIiLKeCxuchmFQqDUdOX5a/yWtpW+f/z4I5yc9sPH5xO+fInCypWtMjtFIiIiteKkJbmIEIkLG9+FraR9W7d6oXbtv+Hj8wkAsGmTFz58CM/0PImIiNSJV25ykX/POPx6SRvIZDJERMRh+PDj2Lnzx9NSVauaYe/e7jA3N8zsNImIiNSKxU0uMdD1ttL298LmwYMgODntg6/vF2nf0KG1sHKlI/T1tTM7TSIiIrVjcZMLrLvwAueffpS2ny9qDQDYuPEuxo71QExMAgDA0FAHGze2R48eVbIkTyIioszA4iYHi0tQoNzMk0qxG9OaQ1tTA3v2PMTQocekuI1NUbi7d0PZsoUyO00iIqJMxQHFOZT/l6hEhc32gXVQ1FgPANCtWyXY21sCAEaOrI1r135hYUNERHkCr9zkUI2WXZC+19HUwLP/vxX1nba2Jvbs6Yo7dz6gS5eKmZ0eERFRluGVmxyow9or0vcNy5ri1uSm6NXrALy8ApTalShhzMKGiIjyHF65yWHWnHuutFbUoArFULPmX3j9OgS3b3/A3btDYGSkm4UZEhERZS0WNzlEXIICLVdegt+XKADfJuUbUbggWjTdhvh4BQDgy5coPHnyCXZ2xbMyVSIioizF4iYHEEIoDR6WRyeg0vMITD15V4rVrVscbm5dUbKkSRZkSERElH2wuMnmDtx9h4n77kvbse8joH35A077h0mxX3+1x8KFzaCtrZnUIYiIiPIUFjfZWI35pxESFQ/g29WbsFtBiLjyAQkJ325DFSqkj+3bO6NNm7JZmSYREVG2wuImm4mXK2C78CxCo+OV4vMalcWQlV5SYdOgQQns2dMVxYsbZUWaRERE2RaLm2ykxYpLeP4xIlH8+aLW0NbUwNffHDBx4mlMn94Qc+c2gZYWn+QnIiL6NxY32cSBu++UChshBPYPq4fa1j9mFR43ri4aNiwJW1vzrEiRiIgoR2Bxkw2sPvscK88+k7Yvj26IQQOO4J+tD1B7flMpLpPJWNgQ5QBCCCQkJEAul2d1KkQ5ira2NjQ1//vDMSxustiSE0/w1+VX0nbP4gVhZ/s3AgMjcPbsKzRqVBIODqWyMEMiUkVcXBwCAgIQFRWV1akQ5TgymQzFixdH/vz5/9NxWNxkIc/nn6TCRigEWsRq4H9jz0ChEACAIkXyQ1ub42qIcgqFQoHXr19DU1MT5ubm0NHRgUwmy+q0iHIEIQQ+ffqEd+/eoWzZsv/pCg6Lmyyy4swz/HHuOQAgISIOxe4FY/P1d9L+Fi1KYceOzihS5L9Vr0SUeeLi4qBQKGBpaQk
DA4OsTocoxylcuDD8/PwQHx/P4ianeR8SLRU20a9DEXv2He4ERwMANDRkmD+/CaZNawgNDf7FR5QTaWjwiitRemTUlU4WN5nM41EAhu28B6EQCLnyAeE3AiC+3YWCubkh9uzpikaNSmZtkkRERDkYi5tM9OpTBIbtvPdtQyGg8yFSKmxaty6Dbds6oXDhfFmXIBERUS7Aa6eZRKEQaLb8krT9R5+auHq6HwoU0MP//ueAY8d6sbAhomxPJpPh8OHDan+dixcvQiaTISQkRIodPnwYZcqUgaamJsaNGwdXV1eYmJioLQdfX18ULVoU4eHhanuNvCQuLg5WVla4c+eO2l+LxU0mePExHNZTjiEhPA4AUCifDjrWsEC5coXw6tVYTJ5cn+NriCjLBQYGYvTo0ShVqhR0dXVhaWmJ9u3b49y5c5mei729PQICAmBsbCzFhg4dim7duuHt27dYsGABnJ2d8ezZsxSO8t9MmzYNo0ePhqGhYaJ9FSpUgK6uLgIDAxPts7KywqpVqxLF586dixo1aijFsuqc79u3DxUqVICenh6qVq2KEydOpNj+e7H576+f3/+SJUtQu3ZtGBoawszMDJ06dYKvr6+0X0dHB5MmTcKUKVPU9r6+Y3GjRkIITNp3H03mnkHQbl983PccingF7sx0kNqYmOhlYYZERN/4+fmhVq1aOH/+PJYtW4aHDx/Cw8MDTZs2xciRIzM9Hx0dHRQtWlQaYBoREYGPHz/C0dER5ubmMDQ0hL6+PszMzP7T68THxycZ9/f3x7Fjx9C/f/9E+65cuYLo6Gh069YN27ZtS/drZ9U5v3btGnr27IlffvkFXl5e6NSpEzp16oRHjx6l2tfX1xcBAQHS18/n/9KlSxg5ciRu3LiBM2fOID4+Hi1btkRkZKTUpnfv3rhy5QoeP36slvcmEXlMaGioACBCQ0PV/lqOKy+Jwl12Cg29BQKYK4C5Ytiwf9T+ukSUNaKjo4WPj4+Ijo6WYgqFQkTGxmfJl0KhSHPurVu3FhYWFiIiIiLRvq9fv0rfAxCHDh2Stn/99VdRtmxZoa+vL6ytrcXMmTNFXFyctN/b21s0adJE5M+fXxgaGoqaNWuK27dvCyGE8PPzE+3atRMmJibCwMBAVKpUSRw/flwIIcSFCxcEAPH161fp+5+/Lly4ILZu3SqMjY2Vcj18+LCwsbERurq6wtraWsydO1fEx8cr5f/nn3+K9u3bCwMDAzFnzpwkz8eyZcuEra1tkvv69+8vpk6dKk6ePCnKlSuXaH/JkiXFypUrE8XnzJkjqlevLm2n9ZxnNCcnJ9G2bVulmJ2dnRg6dGiyfX7+/5FWHz9+FADEpUuXlOJNmzYVM2fOTLJPUv+GvlPl85sDitVACIEyU4/j0/l3CL8TJMWtrEwwYIBNFmZGRJktOl6OSrNPZclr+8x3hIFO6r/mg4OD4eHhgUWLFiFfvsRj/1Ia12JoaAhXV1eYm5vj4cOHGDx4MAwNDfHrr78C+PaXuo2NDdavXw9NTU14e3tDW1sbADBy5EjExcXh8uXLyJcvH3x8fJKcmdbe3h6+vr4oX748Dhw4AHt7exQsWBB+fn5K7Tw9PdGvXz/88ccfaNiwIV6+fIkhQ4YAAObMmSO1mzt3LpYuXYpVq1ZBSyvp8+Pp6QlbW9tE8fDwcOzbtw83b95EhQoVEBoaCk9PTzRs2DDZc5SU/3LOd+3ahaFDh6Z4/JMnTyab0/Xr1zFhwgSlmKOjY5rGUtWoUQOxsbGoUqUK5s6di/r16yfbNjQ0FABQsGBBpXidOnXg6emZ6mv9F9miuFm3bh2WLVuGwMBAVK9eHWvWrEGdOnWSbb9v3z7MmjULfn5+KFu2LH777Te0adMmEzNOnlwhUGLEIXw++gpxAT8uxXXpUhGbN3fgbSgiynZevHgBIQQqVKigct+ZM2dK31tZWWHSpElwc3OTiht/f39MnjxZOnbZsmWl9v7+/ujatSuqVq0KACh
VKumlZnR0dKTbHwULFkTRokWTbDdv3jxMnToVLi4u0vEWLFiAX3/9Vam46dWrFwYMGJDi+3rz5k2SxY2bmxvKli2LypUrAwB69OiBzZs3q1zc/Jdz3qFDB9jZ2aXYxsLCItl9gYGBKFKkiFKsSJEiSY4f+q5YsWLYsGEDbG1tERsbi02bNqFJkya4efMmatasmai9QqHAuHHjUL9+fVSpUkVpn7m5Od68eZNi/v9Vlhc37u7umDBhAjZs2AA7OzusWrUKjo6O8PX1TfJe6vd7hUuWLEG7du2we/dudOrUCffu3Ut0ArNCq3HHEODqAxH7bcE8HR1NLF/eEiNH1uY07ER5kL62JnzmO2bZa6eF+D4nRTq4u7vjjz/+wMuXLxEREYGEhAQYGRlJ+ydMmIBBgwZhx44dcHBwQPfu3VG6dGkAwJgxYzB8+HCcPn0aDg4O6Nq1K6pVq5buXO7fv4+rV69i0aJFUkwulyMmJgZRUVHSrNFJFS3/Fh0dDT29xH+MbtmyBX369JG2+/Tpg8aNG2PNmjVJDjxOzn8554aGhiq9VkYoX748ypcvL23b29vj5cuXWLlyJXbs2JGo/ciRI/Ho0SNcuXIl0T59fX21r72W5QOKV6xYgcGDB2PAgAGoVKkSNmzYAAMDA2zZsiXJ9qtXr0arVq0wefJkVKxYEQsWLEDNmjWxdu3aTM5cmRACBe3+wtk196TCplTpArh+/ReMGlWHhQ1RHiWTyWCgo5UlX2n9vVO2bFnIZDI8ffpUpfd2/fp19O7dG23atMGxY8fg5eWFGTNmIC4uTmozd+5cPH78GG3btsX58+dRqVIlHDp0CAAwaNAgvHr1Cn379sXDhw9ha2uLNWvWqJTDzyIiIjBv3jx4e3tLXw8fPsTz58+VCpWkbgP9m6mpKb5+/aoU8/HxwY0bN/Drr79CS0sLWlpaqFu3LqKiouDm5ia1MzIykm7J/CwkJER6+iu95xz4dlsqf/78KX6ldNunaNGiCAoKUooFBQUle0UsOXXq1MGLFy8SxUeNGoVjx47hwoULKF68eKL9wcHBKFy4sEqvpaosLW7i4uJw9+5dODj8eHpIQ0MDDg4OuH79epJ9rl+/rtQe+HavMLn2sbGxCAsLU/pSh/ch0VD8tMhl6w7l4HVvKGrWLKaW1yMiyigFCxaEo6Mj1q1bp/Rky3c/zzXzs2vXrqFkyZKYMWMGbG1tUbZs2SRvN5QrVw7jx4/H6dOn0aVLF2zdulXaZ2lpiWHDhuHgwYOYOHEi/v7773S/j5o1a8LX1xdlypRJ9KXqkhg2Njbw8fFRim3evBmNGjXC/fv3lQqoCRMmYPPmzVK78uXL4+7du4mOee/ePZQrVw5A+s858O221M+vn9RXSlen6tWrl+hR8zNnzqBevXrJ9kmKt7c3ihX78RknhMCoUaNw6NAhnD9/HtbW1kn2e/ToEWxs1Dv+NEtvS33+/BlyuTzJe3/JVbOq3itcsmQJ5s2blzEJp+BTeCwKNTBHXEAklk9qgGHDavFqDRHlGOvWrUP9+vVRp04dzJ8/H9WqVUNCQgLOnDmD9evX48mTJ4n6lC1bFv7+/nBzc0Pt2rVx/Phx6aoM8O3WzuTJk9GtWzdYW1vj3bt3uH37Nrp27QoAGDduHFq3bo1y5crh69evuHDhAipWrJju9zB79my0a9cOJUqUQLdu3aChoYH79+/j0aNHWLhwoUrHcnR0xKBBgyCXy6GpqYn4+Hjs2LED8+fPTzQEYtCgQVixYgUeP36MypUrY/z48WjYsCEWLVqELl26QC6XY8+ePbh+/Tr+/PNPqV96zjnw329LjR07Fo0bN8by5cvRtm1buLm54c6dO9i4caPUZtq0aXj//j22b98OAFi1ahWsra1RuXJlxMTEYNOmTTh//jxOnz4t9Rk5ciR2796NI0eOwNDQUPpcNjY2hr6+vtTO09MTCxYsSHf+aZLmZ7rU4P379wKAuHbtmlJ88uTJok6dOkn
20dbWFrt371aKrVu3TpiZmSXZPiYmRoSGhkpfb9++Veuj4Ko8eklEuUtKj7HmBB8+fBAjR44UJUuWFDo6OsLCwkJ06NBBXLhwQWqDfz0KPnnyZFGoUCGRP39+4ezsLFauXCk9nh0bGyt69OghLC0thY6OjjA3NxejRo2Szs+oUaNE6dKlha6urihcuLDo27ev+Pz5sxAi8aPHX79+lR4B/y6pR8E9PDyEvb290NfXF0ZGRqJOnTpi48aNyeafnPj4eGFubi48PDyEEELs379faGhoiMDAwCTbV6xYUYwfP17aPnXqlKhfv74oUKCAKFSokGjSpEmiR6KFSNs5V4e9e/eKcuXKCR0dHVG5cmXpEfzvXFxcROPGjaXt3377TZQuXVro6emJggULiiZNmojz588r9cG/Htf//rV161apzbVr14SJiYmIiopKMq+MehRc9v8JZYm4uDgYGBhg//796NSpkxR3cXFBSEgIjhw5kqhPiRIlMGHCBIwbN06KzZkzB4cPH8b9+/dTfc2wsDAYGxsjNDRUadAbEdF/FRMTg9evX8Pa2jrJwaiUs6xbtw5Hjx7FqVNZ8yh/buTs7Izq1atj+vTpSe5P6d+QKp/fWTrmRkdHB7Vq1VK696dQKHDu3Llk7/1l1L1CIiKilAwdOhSNGjXi2lIZJC4uDlWrVsX48ePV/lpZ/ij4hAkT4OLiAltbW9SpUwerVq1CZGSkNAdBv379YGFhgSVLlgBI271CIiKi/0pLSwszZszI6jRyDR0dHaV5kdQpy4sbZ2dnfPr0CbNnz0ZgYCBq1KgBDw8PadCwv7+/0ih3e3t77N69GzNnzsT06dNRtmxZHD58OFvMcUNERERZL0vH3GQFjrkhInXhmBui/yZXjLkhIsqN8tjfjEQZJqP+7bC4ISLKIN8XhFT31PJEudX32a01NdO2dEhysnzMDRFRbqGpqQkTExN8/PgRAGBgYMDJPInSSKFQ4NOnTzAwMEh2tfa0YnFDRJSBvq/P873AIaK009DQQIkSJf7zHwUsboiIMpBMJkOxYsVgZmaG+Pj4rE6HKEfR0dFReR2wpLC4ISJSA01Nzf88boCI0ocDiomIiChXYXFDREREuQqLGyIiIspV8tyYm+8TBIWFhWVxJkRERJRW3z+30zLRX54rbr6v7mppaZnFmRAREZGqwsPDYWxsnGKbPLe2lEKhwIcPH2BoaJjhk2uFhYXB0tISb9++5bpVasTznDl4njMHz3Pm4bnOHOo6z0IIhIeHw9zcPNXHxfPclRsNDQ0UL15cra9hZGTEfziZgOc5c/A8Zw6e58zDc5051HGeU7ti8x0HFBMREVGuwuKGiIiIchUWNxlIV1cXc+bMga6ublankqvxPGcOnufMwfOceXiuM0d2OM95bkAxERER5W68ckNERES5CosbIiIiylVY3BAREVGuwuKGiIiIchUWNypat24drKysoKenBzs7O9y6dSvF9vv27UOFChWgp6eHqlWr4sSJE5mUac6mynn++++/0bBhQxQoUAAFChSAg4NDqv9f6BtVf56/c3Nzg0wmQ6dOndSbYC6h6nkOCQnByJEjUaxYMejq6qJcuXL83ZEGqp7nVatWoXz58tDX14elpSXGjx+PmJiYTMo2Z7p8+TLat28Pc3NzyGQyHD58ONU+Fy9eRM2aNaGrq4syZcrA1dVV7XlCUJq5ubkJHR0dsWXLFvH48WMxePBgYWJiIoKCgpJsf/XqVaGpqSn+97//CR8fHzFz5kyhra0tHj58mMmZ5yyqnudevXqJdevWCS8vL/HkyRPRv39/YWxsLN69e5fJmecsqp7n716/fi0sLCxEw4YNRceOHTMn2RxM1fMcGxsrbG1tRZs2bcSVK1fE69evxcWLF4W3t3cmZ56zqHqed+3aJXR1dcWuXbvE69evxalTp0SxYsXE+PHjMznznOXEiRNixowZ4uDBgwKAOHToUIrtX716JQwMDMSECROEj4+PWLN
mjdDU1BQeHh5qzZPFjQrq1KkjRo4cKW3L5XJhbm4ulixZkmR7Jycn0bZtW6WYnZ2dGDp0qFrzzOlUPc//lpCQIAwNDcW2bdvUlWKukJ7znJCQIOzt7cWmTZuEi4sLi5s0UPU8r1+/XpQqVUrExcVlVoq5gqrneeTIkaJZs2ZKsQkTJoj69eurNc/cJC3Fza+//ioqV66sFHN2dhaOjo5qzEwI3pZKo7i4ONy9excODg5STENDAw4ODrh+/XqSfa5fv67UHgAcHR2TbU/pO8//FhUVhfj4eBQsWFBdaeZ46T3P8+fPh5mZGX755ZfMSDPHS895Pnr0KOrVq4eRI0eiSJEiqFKlChYvXgy5XJ5Zaec46TnP9vb2uHv3rnTr6tWrVzhx4gTatGmTKTnnFVn1OZjnFs5Mr8+fP0Mul6NIkSJK8SJFiuDp06dJ9gkMDEyyfWBgoNryzOnSc57/bcqUKTA3N0/0D4p+SM95vnLlCjZv3gxvb+9MyDB3SM95fvXqFc6fP4/evXvjxIkTePHiBUaMGIH4+HjMmTMnM9LOcdJznnv16oXPnz+jQYMGEEIgISEBw4YNw/Tp0zMj5Twjuc/BsLAwREdHQ19fXy2vyys3lKssXboUbm5uOHToEPT09LI6nVwjPDwcffv2xd9//w1TU9OsTidXUygUMDMzw8aNG1GrVi04OztjxowZ2LBhQ1anlqtcvHgRixcvxp9//ol79+7h4MGDOH78OBYsWJDVqVEG4JWbNDI1NYWmpiaCgoKU4kFBQShatGiSfYoWLapSe0rfef7u999/x9KlS3H27FlUq1ZNnWnmeKqe55cvX8LPzw/t27eXYgqFAgCgpaUFX19flC5dWr1J50Dp+XkuVqwYtLW1oampKcUqVqyIwMBAxMXFQUdHR60550TpOc+zZs1C3759MWjQIABA1apVERkZiSFDhmDGjBnQ0ODf/hkhuc9BIyMjtV21AXjlJs10dHRQq1YtnDt3ToopFAqcO3cO9erVS7JPvXr1lNoDwJkzZ5JtT+k7zwDwv//9DwsWLICHhwdsbW0zI9UcTdXzXKFCBTx8+BDe3t7SV4cOHdC0aVN4e3vD0tIyM9PPMdLz81y/fn28ePFCKh4B4NmzZyhWrBgLm2Sk5zxHRUUlKmC+F5SCSy5mmCz7HFTrcOVcxs3NTejq6gpXV1fh4+MjhgwZIkxMTERgYKAQQoi+ffuKqVOnSu2vXr0qtLS0xO+//y6ePHki5syZw0fB00DV87x06VKho6Mj9u/fLwICAqSv8PDwrHoLOYKq5/nf+LRU2qh6nv39/YWhoaEYNWqU8PX1FceOHRNmZmZi4cKFWfUWcgRVz/OcOXOEoaGh2LNnj3j16pU4ffq0KF26tHBycsqqt5AjhIeHCy8vL+Hl5SUAiBUrVggvLy/x5s0bIYQQU6dOFX379pXaf38UfPLkyeLJkydi3bp1fBQ8O1qzZo0oUaKE0NHREXXq1BE3btyQ9jVu3Fi4uLgotd+7d68oV66c0NHREZUrVxbHjx/P5IxzJlXOc8mSJQWARF9z5szJ/MRzGFV/nn/G4ibtVD3P165dE3Z2dkJXV1eUKlVKLFq0SCQkJGRy1jmPKuc5Pj5ezJ07V5QuXVro6ekJS0tLMWLECPH169fMTzwHuXDhQpK/b7+fWxcXF9G4ceNEfWrUqCF0dHREqVKlxNatW9Wep0wIXn8jIiKi3INjboiIiChXYXFDREREuQqLGyIiIspVWNwQERFRrsLihoiIiHIVFjdERESUq7C4ISIiolyFxQ0RERHlKixuiJLg6uoKExOTrE4j3WQyGQ4fPpxim/79+6NTp06Zkk92M2vWLAwZMiRTXuvixYuQyWQICQlJsZ2VlRVWrVql1lxUfY2M+neQlp9HVfn4+KB48eKIjIzM0ONS7sDihnKt/v37QyaTJfp68eJFVqcGV1dXKR8NDQ0UL14cAwYMwMePHzPk+AEBAWjdujUAwM/PDzK
ZDN7e3kptVq9eDVdX1wx5veTMnTtXep+ampqwtLTEkCFDEBwcrNJxMrIQCwwMxOrVqzFjxgyl43/PU0dHB2XKlMH8+fORkJDwn1/P3t4eAQEBMDY2BpB8wXD79u1MK7hygkWLFsHe3h4GBgZJnq9KlSqhbt26WLFiReYnR9keixvK1Vq1aoWAgAClL2tr66xOCwBgZGSEgIAAvHv3Dn///TdOnjyJvn37ZsixixYtCl1d3RTbGBsbZ8rVqcqVKyMgIAD+/v7YunUrPDw8MHz4cLW/bnI2bdoEe3t7lCxZUin+/Wfl+fPnmDhxIubOnYtly5b959fT0dFB0aJFIZPJUmxXuHBhGBgY/OfXyy3i4uLQvXv3FH9WBgwYgPXr12dIEUq5C4sbytV0dXVRtGhRpS9NTU2sWLECVatWRb58+WBpaYkRI0YgIiIi2ePcv38fTZs2haGhIYyMjFCrVi3cuXNH2n/lyhU0bNgQ+vr6sLS0xJgxY1K9XC6TyVC0aFGYm5ujdevWGDNmDM6ePYvo6GgoFArMnz8fxYsXh66uLmrUqAEPDw+pb1xcHEaNGoVixYpBT08PJUuWxJIlS5SO/f02wPdizsbGBjKZDE2aNAGgfDVk48aNMDc3h0KhUMqxY8eOGDhwoLR95MgR1KxZE3p6eihVqhTmzZuX6geLlpYWihYtCgsLCzg4OKB79+44c+aMtF8ul+OXX36BtbU19PX1Ub58eaxevVraP3fuXGzbtg1HjhyRrq5cvHgRAPD27Vs4OTnBxMQEBQsWRMeOHeHn55diPm5ubmjfvn2i+PeflZIlS2L48OFwcHDA0aNHAQBfv35Fv379UKBAARgYGKB169Z4/vy51PfNmzdo3749ChQogHz58qFy5co4ceIEAOXbUhcvXsSAAQMQGhoqvZe5c+cCUL5l1KtXLzg7OyvlFx8fD1NTU2zfvh0AoFAosGTJEum8Va9eHfv370/xvf9bWv8dHD58GGXLloWenh4cHR3x9u1bpf3p+blIzbx58zB+/HhUrVo12TYtWrRAcHAwLl269J9ei3IfFjeUJ2loaOCPP/7A48ePsW3bNpw/fx6//vprsu179+6N4sWL4/bt27h79y6mTp0KbW1tAMDLly/RqlUrdO3aFQ8ePIC7uzuuXLmCUaNGqZSTvr4+FAoFEhISsHr1aixfvhy///47Hjx4AEdHR3To0EH6QP3jjz9w9OhR7N27F76+vti1axesrKySPO6tW7cAAGfPnkVAQAAOHjyYqE337t3x5csXXLhwQYoFBwfDw8MDvXv3BgB4enqiX79+GDt2LHx8fPDXX3/B1dUVixYtSvN79PPzw6lTp6CjoyPFFAoFihcvjn379sHHxwezZ8/G9OnTsXfvXgDApEmT4OTkpHQVzt7eHvHx8XB0dIShoSE8PT1x9epV5M+fH61atUJcXFySrx8cHAwfHx/Y2tqmmqu+vr50nP79++POnTs4evQorl+/DiEE2rRpg/j4eADAyJEjERsbi8uXL+Phw4f47bffkD9//kTHtLe3x6pVq6SrdgEBAZg0aVKidr1798Y///yjVGicOnUKUVFR6Ny5MwBgyZIl2L59OzZs2IDHjx9j/Pjx6NOnj0of9Gn5dxAVFYVFixZh+/btuHr1KkJCQtCjRw9pf3p+Lpo0aYL+/funOc/k6OjooEaNGvD09PzPx6JcRu3rjhNlERcXF6GpqSny5csnfXXr1i3Jtvv27ROFChWStrdu3SqMjY2lbUNDQ+Hq6ppk319++UUMGTJEKebp6Sk0NDREdHR0kn3+ffxnz56JcuXKCVtbWyGEEObm5mLRokVKfWrXri1GjBghhBBi9OjRolmzZkKhUCR5fADi0KFDQgghXr9+LQAILy8vpTYuLi6iY8eO0nbHjh3FwIEDpe2//vpLmJubC7lcLoQQonnz5mLx4sVKx9ixY4coVqxYkjkIIcScOXOEhoaGyJcvn9DT0xMABACxYsWKZPs
IIcTIkSNF165dk831+2uXL19e6RzExsYKfX19cerUqSSP6+XlJQAIf39/pfjPx1coFOLMmTNCV1dXTJo0STx79kwAEFevXpXaf/78Wejr64u9e/cKIYSoWrWqmDt3bpKveeHCBQFAfP36VQiR+P/9dyVLlhQrV64UQggRHx8vTE1Nxfbt26X9PXv2FM7OzkIIIWJiYoSBgYG4du2a0jF++eUX0bNnzyTz+PdrJCWpfwcAxI0bN6TYkydPBABx8+ZNIUTafi5+/nkUQoi+ffuKqVOnJpvHz5I7X9917txZ9O/fP03HorxDK6uKKqLM0LRpU6xfv17azpcvH4BvVzGWLFmCp0+fIiwsDAkJCYiJiUFUVFSS4x4mTJiAQYMGYceOHdKtldKlSwP4dsvqwYMH2LVrl9ReCAGFQoHXr1+jYsWKSeYWGhqK/PnzQ6FQICYmBg0aNMCmTZsQFhaGDx8+oH79+krt69evj/v37wP4diWhRYsWKF++PFq1aoV27dqhZcuW/+lc9e7dG4MHD8aff/4JXV1d7Nq1Cz169ICGhob0Pq9evar0F7lcLk/xvAFA+fLlcfToUcTExGDnzp3w9vbG6NGjldqsW7cOW7Zsgb+/P6KjoxEXF4caNWqkmO/9+/fx4sULGBoaKsVjYmLw8uXLJPtER0cDAPT09BLtO3bsGPLnz4/4+HgoFAr06tULc+fOxblz56ClpQU7OzupbaFChVC+fHk8efIEADBmzBgMHz4cp0+fhoODA7p27Ypq1aqlmH9KtLS04OTkhF27dqFv376IjIzEkSNH4ObmBgB48eIFoqKi0KJFC6V+cXFxsLGxSfPrpOXfgZaWFmrXri31qVChAkxMTPDkyRPUqVMnXT8X32+tZQR9fX1ERUVl2PEod2BxQ7lavnz5UKZMGaWYn58f2rVrh+HDh2PRokUoWLAgrly5gl9++QVxcXFJ/jKeO3cuevXqhePHj+PkyZOYM2cO3Nzc0LlzZ0RERGDo0KEYM2ZMon4lSpRINjdDQ0Pcu3cPGhoaKFasGPT19QEAYWFhqb6vmjVr4vXr1zh58iTOnj0LJycnODg4qDzm4mft27eHEALHjx9H7dq14enpiZUrV0r7IyIiMG/ePHTp0iVR36SKhe++P30EAEuXLkXbtm0xb948LFiwAMC3MTCTJk3C8uXLUa9ePRgaGmLZsmW4efNmivlGRESgVq1aSkXld4ULF06yj6mpKYBvY2j+3eZ7IayjowNzc3NoaaX91+OgQYPg6OiI48eP4/Tp01iyZAmWL1+eqIhTRe/evdG4cWN8/PgRZ86cgb6+Plq1agUA0u2q48ePw8LCQqlfagPJv0vPv4OkpPfnIqMEBwdLf2gQfcfihvKcu3fvQqFQYPny5dJVie/jO1JSrlw5lCtXDuPHj0fPnj2xdetWdO7cGTVr1oSPj0+iIio1GhoaSfYxMjKCubk5rl69isaNG0vxq1evok6dOkrtnJ2d4ezsjG7duqFVq1YIDg5GwYIFlY73fXyLXC5PMR89PT106dIFu3btwosXL1C+fHnUrFlT2l+zZk34+vqq/D7/bebMmWjWrBmGDx8uvU97e3uMGDFCavPvKy86OjqJ8q9Zsybc3d1hZmYGIyOjNL126dKlYWRkBB8fH5QrV05pX1KFMABUrFgRCQkJuHnzJuzt7QEAX758ga+vLypVqiS1s7S0xLBhwzBs2DBMmzYNf//9d5LFTVLvJSn29vawtLSEu7s7Tp48ie7du0vjvCpVqgRdXV34+/sr/YyoIq3/DhISEnDnzh3pZ8/X1xchISHSFcmM+rlIr0ePHqFbt25Z8tqUfXFAMeU5ZcqUQXx8PNasWYNXr15hx44d2LBhQ7Lto6OjMWrUKFy8eBFv3rzB1atXcfv2bemX+5QpU3Dt2jWMGjUK3t7eeP78OY4cOaLygOKfTZ48Gb/99hvc3d3h6+uLqVOnwtvbG2PHjgXw7SmXPXv24OnTp3j27Bn
27duHokWLJvlot5mZGfT19eHh4YGgoCCEhoYm+7q9e/fG8ePHsWXLFmkg8XezZ8/G9u3bMW/ePDx+/BhPnjyBm5sbZs6cqdJ7q1evHqpVq4bFixcDAMqWLYs7d+7g1KlTePbsGWbNmoXbt28r9bGyssKDBw/g6+uLz58/Iz4+Hr1794apqSk6duwIT09PvH79GhcvXsSYMWPw7t27JF9bQ0MDDg4OuHLlSprzLVu2LDp27IjBgwfjypUruH//Pvr06QMLCwt07NgRADBu3DicOnUKr1+/xr1793DhwoVkb0daWVkhIiIC586dw+fPn1O8pdKrVy9s2LABZ86cUfr/YWhoiEmTJmH8+PHYtm0bXr58iXv37mHNmjXYtm1bmt5XWv8daGtrY/To0bh58ybu3r2L/v37o27dulKxk56fi379+mHatGkp5ufv7w9vb2/4+/tDLpfD29sb3t7eSoOs/fz88P79ezg4OKTpPVMektWDfojUJalBqN+tWLFCFCtWTOjr6wtHR0exffv2ZAd9xsbGih49eghLS0uho6MjzM3NxahRo5QGC9+6dUu0aNFC5M+fX+TLl09Uq1Yt0YDgn6U2SFIul4u5c+cKCwsLoa2tLapXry5Onjwp7d+4caOoUaOGyPd/7dw/i+JAHMbxcUGCLKigFiIiKGJpp2BjoeCr8E8jKPgCxMbO2lp7rWzFTnwDlhaChTaCiGAngj5b3J3s6np33bLZ7wfSJGQymZnAQ/Ijr69yu93K5XKaz+e34+augLPf7yscDuvl5UXZbPbp+FwuFwWDQRljtFqtHvo1mUyUyWTkcrnkdruVSqXU6/We3ke73VYymXzYPxwOZVmWNpuNTqeTKpWKPB6PvF6v6vW6ms3mh/N2u91tfI0xmk6nkqTtdqtSqSS/3y/LshSNRlWtVnU8Hp/2aTweKxQK3Qqln43Fe4fDQcViUR6P57Zmlsvl7Xij0VAsFpNlWQoEAioWi9rv95IeC4olqVaryefzyRijdrst6fNi38ViIWOMIpHIQ/H49XpVt9tVIpGQ0+lUIBBQoVDQbDZ7eh/31/jf52A0GikajcqyLOXzea3X6w/t/mtd3K/HbDarcrn8tJ/SrzkxvwvQ329/5l6SOp2OCoXCX9vBz+SQpK8IVQDwFSSZdDp9+7yI7+l8Ppt4PG4Gg8FD8T3AZykAP4rD4TC9Xo+/2n5zm83GtFotgg0+xZsbAABgK7y5AQAAtkK4AQAAtkK4AQAAtkK4AQAAtkK4AQAAtkK4AQAAtkK4AQAAtkK4AQAAtkK4AQAAtvIGkeJJ/EwX2AMAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from apt.risk.data_assessment.dataset_attack_membership_knn_probabilities import \\\n", + " DatasetAttackConfigMembershipKnnProbabilities, DatasetAttackMembershipKnnProbabilities\n", + "\n", + "dataset_name = \"nursery_kde\"\n", + "\n", + "config_g = DatasetAttackConfigMembershipKnnProbabilities(use_batches=True,\n", + " generate_plot=True)\n", + "attack_g = DatasetAttackMembershipKnnProbabilities(original_data_members,\n", + " original_data_non_members,\n", + " synthetic_data,\n", + " config_g,\n", + " dataset_name)\n", + "\n", + "score_g = attack_g.assess_privacy()\n", + "score_g" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "### DatasetAttackWholeDatasetKnnDistance\n", + "Run the privacy risk assessment for synthetic datasets based on distances of synthetic data records from\n", + "members (training set) and non-members (holdout set). \n", + "\n", + "The privacy risk measure is the share of synthetic\n", + "records closer to the training than the holdout dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetAttackScoreWholeDatasetKnnDistance(dataset_name='nursery_kde', risk_score=0.841, result=None, share=0.841, assessment_type='WholeDatasetKnnDistance')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import \\\n", + " DatasetAttackConfigWholeDatasetKnnDistance, DatasetAttackWholeDatasetKnnDistance\n", + " \n", + "config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False)\n", + "attack_h = DatasetAttackWholeDatasetKnnDistance(original_data_members, original_data_non_members,\n", + " synthetic_data, config_h, dataset_name)\n", + "\n", + "score_h = attack_h.assess_privacy()\n", + "score_h" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv1", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "vscode": { + "interpreter": { + "hash": "a7b572376dda99aaa0cfb20ab0ebad1d786e8d83835a737650854479888cdec3" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/requirements.txt b/requirements.txt index b6f5d56..2a438f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,8 @@ tensorflow==2.8.3 xgboost==1.7.2 Pillow==9.3.0 sortedcontainers==2.4.0 + +#notebooks +notebook +jupyter +ipywidgets From 04d8df8091bd3bc3a6fb80e0e36ee1b24fa62bab Mon Sep 17 00:00:00 2001 From: abigailgold <57357634+abigailgold@users.noreply.github.com> Date: Mon, 8 May 2023 12:50:55 +0300 Subject: [PATCH 04/11] Increase version to 0.2.0 (#74) * Remove tensorflow 
dependency if not using keras model * Remove xgboost dependency if not using xgboost model * Documentation updates Signed-off-by: abigailt --- apt/__init__.py | 2 +- apt/risk/__init__.py | 1 + .../data_assessment/attack_strategy_utils.py | 18 ++-- .../dataset_assessment_manager.py | 20 +++-- apt/risk/data_assessment/dataset_attack.py | 56 ++++++------ ...set_attack_membership_knn_probabilities.py | 86 +++++++++---------- .../data_assessment/dataset_attack_result.py | 16 ++++ ...taset_attack_whole_dataset_knn_distance.py | 54 ++++++------ apt/utils/models/keras_model.py | 9 +- apt/utils/models/pytorch_model.py | 68 +++++++++------ apt/utils/models/xgboost_model.py | 3 +- docs/conf.py | 4 +- docs/index.rst | 2 + docs/source/apt.anonymization.rst | 1 - docs/source/apt.minimization.rst | 1 - docs/source/apt.risk.data_assessment.rst | 61 +++++++++++++ docs/source/apt.risk.rst | 18 ++++ docs/source/apt.rst | 1 + docs/source/apt.utils.datasets.rst | 1 - docs/source/apt.utils.models.rst | 23 +++++ docs/source/apt.utils.rst | 1 - requirements.txt | 3 + setup.cfg | 2 +- tests/test_minimizer.py | 3 + tests/test_model.py | 4 +- 25 files changed, 306 insertions(+), 152 deletions(-) create mode 100644 apt/risk/__init__.py create mode 100644 docs/source/apt.risk.data_assessment.rst create mode 100644 docs/source/apt.risk.rst diff --git a/apt/__init__.py b/apt/__init__.py index ae1d0d0..7aba4bf 100644 --- a/apt/__init__.py +++ b/apt/__init__.py @@ -6,4 +6,4 @@ from apt import anonymization from apt import minimization from apt import utils -__version__ = "0.1.0" +__version__ = "0.2.0" diff --git a/apt/risk/__init__.py b/apt/risk/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/apt/risk/__init__.py @@ -0,0 +1 @@ + diff --git a/apt/risk/data_assessment/attack_strategy_utils.py b/apt/risk/data_assessment/attack_strategy_utils.py index 674feff..b0be8a1 100644 --- a/apt/risk/data_assessment/attack_strategy_utils.py +++ 
b/apt/risk/data_assessment/attack_strategy_utils.py @@ -9,21 +9,20 @@ from apt.utils.datasets import ArrayDataset class AttackStrategyUtils(abc.ABC): """ - Abstract base class for common utilities of various privacy attack strategies. + Abstract base class for common utilities of various privacy attack strategies. """ pass class KNNAttackStrategyUtils(AttackStrategyUtils): """ - Common utilities for attack strategy based on KNN distances. + Common utilities for attack strategy based on KNN distances. + + :param use_batches: Use batches with a progress meter or not when finding KNNs for query set. + :param batch_size: if use_batches=True, the size of batch_size should be > 0. """ def __init__(self, use_batches: bool = False, batch_size: int = 10) -> None: - """ - :param use_batches: Use batches with a progress meter or not when finding KNNs for query set - :param batch_size: if use_batches=True, the size of batch_size should be > 0 - """ self.use_batches = use_batches self.batch_size = batch_size if use_batches: @@ -31,11 +30,18 @@ class KNNAttackStrategyUtils(AttackStrategyUtils): raise ValueError(f"When using batching batch_size should be > 0, and not {batch_size}") def fit(self, knn_learner: NearestNeighbors, dataset: ArrayDataset): + """ + Fit the KNN learner. + + :param knn_learner: The KNN model to fit. + :param dataset: The training set to fit the model on. + """ knn_learner.fit(dataset.get_samples()) def find_knn(self, knn_learner: NearestNeighbors, query_samples: ArrayDataset, distance_processor=None): """ Nearest neighbor search function. + :param query_samples: query samples, to which nearest neighbors are to be found :param knn_learner: unsupervised learner for implementing neighbor searches, after it was fitted :param distance_processor: function for processing the distance into another more relevant metric per sample. 
diff --git a/apt/risk/data_assessment/dataset_assessment_manager.py b/apt/risk/data_assessment/dataset_assessment_manager.py index 78beeef..a94fe70 100644 --- a/apt/risk/data_assessment/dataset_assessment_manager.py +++ b/apt/risk/data_assessment/dataset_assessment_manager.py @@ -15,6 +15,12 @@ from apt.utils.datasets import ArrayDataset @dataclass class DatasetAssessmentManagerConfig: + """ + Configuration for DatasetAssessmentManager. + + :param persist_reports: Whether to save assessment results to filesystem. + :param generate_plots: Whether to generate and visualize plots as part of assessment. + """ persist_reports: bool = False generate_plots: bool = False @@ -22,14 +28,13 @@ class DatasetAssessmentManagerConfig: class DatasetAssessmentManager: """ The main class for running dataset assessment attacks. + + :param config: Configuration parameters to guide the dataset assessment process """ attack_scores_per_record_knn_probabilities: list[DatasetAttackScore] = [] attack_scores_whole_dataset_knn_distance: list[DatasetAttackScore] = [] def __init__(self, config: Optional[DatasetAssessmentManagerConfig] = DatasetAssessmentManagerConfig) -> None: - """ - :param config: Configuration parameters to guide the dataset assessment process - """ self.config = config def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, @@ -67,14 +72,17 @@ class DatasetAssessmentManager: return [score_gl, score_h] def dump_all_scores_to_files(self): + """ + Save assessment results to filesystem. 
+ """ if self.config.persist_reports: results_log_file = "_results.log.csv" - self.dump_scores_to_file(self.attack_scores_per_record_knn_probabilities, + self._dump_scores_to_file(self.attack_scores_per_record_knn_probabilities, "per_record_knn_probabilities" + results_log_file, True) - self.dump_scores_to_file(self.attack_scores_whole_dataset_knn_distance, + self._dump_scores_to_file(self.attack_scores_whole_dataset_knn_distance, "whole_dataset_knn_distance" + results_log_file, True) @staticmethod - def dump_scores_to_file(attack_scores: list[DatasetAttackScore], filename: str, header: bool): + def _dump_scores_to_file(attack_scores: list[DatasetAttackScore], filename: str, header: bool): run_results_df = pd.DataFrame(attack_scores).drop('result', axis=1, errors='ignore') # don't serialize result run_results_df.to_csv(filename, header=header, encoding='utf-8', index=False, mode='w') # Overwrite diff --git a/apt/risk/data_assessment/dataset_attack.py b/apt/risk/data_assessment/dataset_attack.py index e057c8a..76b6330 100644 --- a/apt/risk/data_assessment/dataset_attack.py +++ b/apt/risk/data_assessment/dataset_attack.py @@ -16,32 +16,30 @@ from apt.utils.datasets import ArrayDataset class Config(abc.ABC): """ - The base class for dataset attack configurations + The base class for dataset attack configurations """ pass class DatasetAttack(abc.ABC): """ - The interface for performing privacy attack for risk assessment of synthetic datasets to be used in AI model - training. The original data members (training data) and non-members (the holdout data) should be available. - For reliability, all the datasets should be preprocessed and normalized. + The interface for performing privacy attack for risk assessment of synthetic datasets to be used in AI model + training. The original data members (training data) and non-members (the holdout data) should be available. + For reliability, all the datasets should be preprocessed and normalized. 
+ + :param original_data_members: A container for the training original samples and labels, + only samples are used in the assessment + :param original_data_non_members: A container for the holdout original samples and labels, + only samples are used in the assessment + :param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment + :param config: Configuration parameters to guide the assessment process + :param dataset_name: A name to identify the dataset under attack, optional + :param attack_strategy_utils: Utils for use with the attack strategy, optional """ def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, synthetic_data: ArrayDataset, config: Config, dataset_name: str, attack_strategy_utils: Optional[AttackStrategyUtils] = None) -> None: - """ - :param original_data_members: A container for the training original samples and labels, - only samples are used in the assessment - :param original_data_non_members: A container for the holdout original samples and labels, - only samples are used in the assessment - :param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment - :param config: Configuration parameters to guide the assessment process - :param dataset_name: A name to identify the dataset under attack, optional - :param attack_strategy_utils: Utils for use with the attack strategy, optional - """ - self.original_data_members = original_data_members self.original_data_non_members = original_data_non_members self.synthetic_data = synthetic_data @@ -52,7 +50,8 @@ class DatasetAttack(abc.ABC): @abc.abstractmethod def assess_privacy(self) -> DatasetAttackScore: """ - Assess the privacy of the dataset + Assess the privacy of the dataset. 
+ :return: score: DatasetAttackScore the privacy attack risk score """ @@ -61,14 +60,15 @@ class DatasetAttack(abc.ABC): class DatasetAttackMembership(DatasetAttack): """ - An abstract base class for performing privacy risk assessment for synthetic datasets on a per-record level. + An abstract base class for performing privacy risk assessment for synthetic datasets on a per-record level. """ @abc.abstractmethod def calculate_privacy_score(self, dataset_attack_result: DatasetAttackResultMembership, generate_plot: bool = False) -> DatasetAttackScore: """ - Calculate dataset privacy score based on the result of the privacy attack + Calculate dataset privacy score based on the result of the privacy attack. + :return: score: DatasetAttackScore """ @@ -78,15 +78,16 @@ class DatasetAttackMembership(DatasetAttack): def plot_roc_curve(dataset_name: str, member_probabilities: np.ndarray, non_member_probabilities: np.ndarray, filename_prefix: str = ""): """ - Plot ROC curve - :param dataset_name: dataset name, will become part of the plot filename - :param member_probabilities: probability estimates of the member samples, the training data - :param non_member_probabilities: probability estimates of the non-member samples, the hold-out data - :param filename_prefix: name prefix for the ROC curve plot + Plot ROC curve. + + :param dataset_name: dataset name, will become part of the plot filename. + :param member_probabilities: probability estimates of the member samples, the training data. + :param non_member_probabilities: probability estimates of the non-member samples, the hold-out data. + :param filename_prefix: name prefix for the ROC curve plot. 
""" labels = np.concatenate((np.zeros((len(non_member_probabilities),)), np.ones((len(member_probabilities),)))) results = np.concatenate((non_member_probabilities, member_probabilities)) - svc_disp = RocCurveDisplay.from_predictions(labels, results) + RocCurveDisplay.from_predictions(labels, results) plt.plot([0, 1], [0, 1], color="navy", linewidth=2, linestyle="--", label='No skills') plt.title('ROC curve') plt.savefig(f'{filename_prefix}{dataset_name}_roc_curve.png') @@ -94,9 +95,10 @@ class DatasetAttackMembership(DatasetAttack): @staticmethod def calculate_metrics(member_probabilities: np.ndarray, non_member_probabilities: np.ndarray): """ - Calculate attack performance metrics - :param member_probabilities: probability estimates of the member samples, the training data - :param non_member_probabilities: probability estimates of the non-member samples, the hold-out data + Calculate attack performance metrics. + + :param member_probabilities: probability estimates of the member samples, the training data. + :param non_member_probabilities: probability estimates of the non-member samples, the hold-out data. :return: fpr: False Positive rate tpr: True Positive rate diff --git a/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py b/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py index 7779b17..374ff18 100644 --- a/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py +++ b/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py @@ -19,18 +19,18 @@ from apt.utils.datasets import ArrayDataset @dataclass class DatasetAttackConfigMembershipKnnProbabilities(Config): - """Configuration for DatasetAttackMembershipKnnProbabilities. + """ + Configuration for DatasetAttackMembershipKnnProbabilities. - Attributes: - k: Number of nearest neighbors to search - use_batches: Divide query samples into batches or not. - batch_size: Query sample batch size. 
- compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must return - one value indicating the distance between those vectors. - See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation. - distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in - sklearn.neighbors.NearestNeighbors documentation. - generate_plot: Generate or not an AUR ROC curve and persist it in a file + :param k: Number of nearest neighbors to search. + :param use_batches: Divide query samples into batches or not. + :param batch_size: Query sample batch size. + :param compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must + return one value indicating the distance between those vectors. + See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation. + :param distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in + sklearn.neighbors.NearestNeighbors documentation. + :param generate_plot: Generate or not an AUC ROC curve and persist it in a file. """ k: int = 5 use_batches: bool = False @@ -42,7 +42,14 @@ class DatasetAttackConfigMembershipKnnProbabilities(Config): @dataclass class DatasetAttackScoreMembershipKnnProbabilities(DatasetAttackScore): - """DatasetAttackMembershipKnnProbabilities privacy risk score. + """ + DatasetAttackMembershipKnnProbabilities privacy risk score. + + :param dataset_name: dataset name to be used in reports + :param roc_auc_score: the area under the receiver operating characteristic curve (AUC ROC) to evaluate the + attack performance. + :param average_precision_score: the proportion of predicted members that are correctly members. + :param result: the result of the membership inference attack.
""" roc_auc_score: float average_precision_score: float @@ -50,13 +57,6 @@ class DatasetAttackScoreMembershipKnnProbabilities(DatasetAttackScore): def __init__(self, dataset_name: str, roc_auc_score: float, average_precision_score: float, result: DatasetAttackResultMembership) -> None: - """ - dataset_name: dataset name to be used in reports - roc_auc_score: the area under the receiver operating characteristic curve (AUC ROC) to evaluate the attack - performance. - average_precision_score: the proportion of predicted members that are correctly members - result: the result of the membership inference attack - """ super().__init__(dataset_name=dataset_name, risk_score=roc_auc_score, result=result) self.roc_auc_score = roc_auc_score self.average_precision_score = average_precision_score @@ -64,24 +64,23 @@ class DatasetAttackScoreMembershipKnnProbabilities(DatasetAttackScore): class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership): """ - Privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of - members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset. - By default, the Euclidean distance is used (L2 norm), but another ``compute_distance()`` method can be provided - in configuration instead. - The area under the receiver operating characteristic curve (AUC ROC) gives the privacy risk measure. + Privacy risk assessment for synthetic datasets based on Black-Box MIA attack using distances of + members (training set) and non-members (holdout set) from their nearest neighbors in the synthetic dataset. + By default, the Euclidean distance is used (L2 norm), but another ``compute_distance()`` method can be provided + in configuration instead. + The area under the receiver operating characteristic curve (AUC ROC) gives the privacy risk measure. 
+ + :param original_data_members: A container for the training original samples and labels + :param original_data_non_members: A container for the holdout original samples and labels + :param synthetic_data: A container for the synthetic samples and labels + :param config: Configuration parameters to guide the attack, optional + :param dataset_name: A name to identify this dataset, optional """ def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, synthetic_data: ArrayDataset, config: DatasetAttackConfigMembershipKnnProbabilities = DatasetAttackConfigMembershipKnnProbabilities(), dataset_name: str = DEFAULT_DATASET_NAME): - """ - :param original_data_members: A container for the training original samples and labels - :param original_data_non_members: A container for the holdout original samples and labels - :param synthetic_data: A container for the synthetic samples and labels - :param config: Configuration parameters to guide the attack, optional - :param dataset_name: A name to identify this dataset, optional - """ attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size) super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name, attack_strategy_utils) @@ -103,10 +102,9 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership): by the Parzen window density estimation in ``probability_per_sample()``, computed from the NN distances from the query samples to the synthetic data samples. 
- :return: - Privacy score of the attack together with the attack result with the probabilities of member and - non-member samples to be generated by the synthetic data generator based on the NN distances from the - query samples to the synthetic data samples + :return: Privacy score of the attack together with the attack result with the probabilities of member and + non-member samples to be generated by the synthetic data generator based on the NN distances from the + query samples to the synthetic data samples """ # nearest neighbor search self.attack_strategy_utils.fit(self.knn_learner, self.synthetic_data) @@ -130,11 +128,11 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership): """ Evaluate privacy score from the probabilities of member and non-member samples to be generated by the synthetic data generator. The probabilities are computed by the ``assess_privacy()`` method. - :param dataset_attack_result attack result containing probabilities of member and non-member samples to be - generated by the synthetic data generator - :param generate_plot generate AUC ROC curve plot and persist it - :return: - score of the attack, based on distance-based probabilities - mainly the ROC AUC score + + :param dataset_attack_result: attack result containing probabilities of member and non-member samples to be + generated by the synthetic data generator. + :param generate_plot: generate AUC ROC curve plot and persist it. + :return: score of the attack, based on distance-based probabilities - mainly the ROC AUC score. """ member_proba, non_member_proba = \ dataset_attack_result.member_probabilities, dataset_attack_result.non_member_probabilities @@ -151,10 +149,10 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership): """ For every sample represented by its distance from the query sample to its KNN in synthetic data, computes the probability of the synthetic data to be part of the query dataset. 
+ :param distances: distance between every query sample in batch to its KNNs among synthetic samples, a numpy - array of size (n, k) with n being the number of samples, k - the number of KNNs - :return: - probability estimates of the query samples being generated and so - of being part of the synthetic set, a - numpy array of size (n,) + array of size (n, k) with n being the number of samples, k - the number of KNNs. + :return: probability estimates of the query samples being generated and so - of being part of the synthetic set, + a numpy array of size (n,) """ return np.average(np.exp(-distances), axis=1) diff --git a/apt/risk/data_assessment/dataset_attack_result.py b/apt/risk/data_assessment/dataset_attack_result.py index 0ed0bd4..afd4b36 100644 --- a/apt/risk/data_assessment/dataset_attack_result.py +++ b/apt/risk/data_assessment/dataset_attack_result.py @@ -8,11 +8,21 @@ DEFAULT_DATASET_NAME = "dataset" @dataclass class DatasetAttackResult: + """ + Basic class for storing privacy risk assessment results. + """ pass @dataclass class DatasetAttackScore: + """ + Basic class for storing privacy risk assessment scores. + + :param dataset_name: The name of the dataset that was assessed. + :param risk_score: The privacy risk score. + :param result: An optional list of more detailed results. + """ dataset_name: str risk_score: float result: Optional[DatasetAttackResult] @@ -20,5 +30,11 @@ class DatasetAttackScore: @dataclass class DatasetAttackResultMembership(DatasetAttackResult): + """ + Class for storing membership attack results. + + :param member_probabilities: The attack probabilities for member samples. + :param non_member_probabilities: The attack probabilities for non-member samples. 
+ """ member_probabilities: np.ndarray non_member_probabilities: np.ndarray diff --git a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py index 1a57bbd..6dea1d5 100644 --- a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py +++ b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py @@ -20,16 +20,16 @@ K = 1 # Number of nearest neighbors to search. For DCR we need only the nearest @dataclass class DatasetAttackConfigWholeDatasetKnnDistance(Config): - """Configuration for DatasetAttackWholeDatasetKnnDistance. + """ + Configuration for DatasetAttackWholeDatasetKnnDistance. - Attributes: - use_batches: Divide query samples into batches or not. - batch_size: Query sample batch size. - compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must return - one value indicating the distance between those vectors. - See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation. - distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in - sklearn.neighbors.NearestNeighbors documentation. + :param use_batches: Divide query samples into batches or not. + :param batch_size: Query sample batch size. + :param compute_distance: A callable function, which takes two arrays representing 1D vectors as inputs and must + return one value indicating the distance between those vectors. + See 'metric' parameter in sklearn.neighbors.NearestNeighbors documentation. + :param distance_params: Additional keyword arguments for the distance computation function, see 'metric_params' in + sklearn.neighbors.NearestNeighbors documentation. 
""" use_batches: bool = False batch_size: int = 10 @@ -39,41 +39,40 @@ class DatasetAttackConfigWholeDatasetKnnDistance(Config): @dataclass class DatasetAttackScoreWholeDatasetKnnDistance(DatasetAttackScore): - """DatasetAttackWholeDatasetKnnDistance privacy risk score. + """ + DatasetAttackWholeDatasetKnnDistance privacy risk score. + + :param dataset_name: Dataset name to be used in reports. + :param share: The share of synthetic records closer to the training than the holdout dataset. + A value of 0.5 or close to it means good privacy. """ share: float assessment_type: str = 'WholeDatasetKnnDistance' # to be used in reports def __init__(self, dataset_name: str, share: float) -> None: - """ - dataset_name: dataset name to be used in reports - share : the share of synthetic records closer to the training than the holdout dataset. - A value of 0.5 or close to it means good privacy. - """ super().__init__(dataset_name=dataset_name, risk_score=share, result=None) self.share = share class DatasetAttackWholeDatasetKnnDistance(DatasetAttack): """ - Privacy risk assessment for synthetic datasets based on distances of synthetic data records from - members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic - records closer to the training than the holdout dataset. - By default, the Euclidean distance is used (L2 norm), but another compute_distance() method can be provided in - configuration instead. + Privacy risk assessment for synthetic datasets based on distances of synthetic data records from + members (training set) and non-members (holdout set). The privacy risk measure is the share of synthetic + records closer to the training than the holdout dataset. + By default, the Euclidean distance is used (L2 norm), but another compute_distance() method can be provided in + configuration instead. + + :param original_data_members: A container for the training original samples and labels. 
+ :param original_data_non_members: A container for the holdout original samples and labels. + :param synthetic_data: A container for the synthetic samples and labels. + :param config: Configuration parameters to guide the assessment process, optional. + :param dataset_name: A name to identify this dataset, optional. """ def __init__(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, synthetic_data: ArrayDataset, config: DatasetAttackConfigWholeDatasetKnnDistance = DatasetAttackConfigWholeDatasetKnnDistance(), dataset_name: str = DEFAULT_DATASET_NAME): - """ - :param original_data_members: A container for the training original samples and labels - :param original_data_non_members: A container for the holdout original samples and labels - :param synthetic_data: A container for the synthetic samples and labels - :param config: Configuration parameters to guide the assessment process, optional - :param dataset_name: A name to identify this dataset, optional - """ attack_strategy_utils = KNNAttackStrategyUtils(config.use_batches, config.batch_size) super().__init__(original_data_members, original_data_non_members, synthetic_data, config, dataset_name, attack_strategy_utils) @@ -90,6 +89,7 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack): """ Calculate the share of synthetic records closer to the training than the holdout dataset, based on the DCR computed by 'calculate_distances()'. 
+ :return: score of the attack, based on the NN distances from the query samples to the synthetic data samples """ diff --git a/apt/utils/models/keras_model.py b/apt/utils/models/keras_model.py index 4f64ed7..6f89a35 100644 --- a/apt/utils/models/keras_model.py +++ b/apt/utils/models/keras_model.py @@ -2,9 +2,6 @@ from typing import Optional import numpy as np -import tensorflow as tf -from tensorflow import keras - from sklearn.metrics import mean_squared_error from apt.utils.models import Model, ModelOutputType, ScoringMethod, check_correct_model_output @@ -14,8 +11,6 @@ from art.utils import check_and_transform_label_format from art.estimators.classification.keras import KerasClassifier as ArtKerasClassifier from art.estimators.regression.keras import KerasRegressor as ArtKerasRegressor -tf.compat.v1.disable_eager_execution() - class KerasModel(Model): """ @@ -41,7 +36,7 @@ class KerasClassifier(KerasModel): queries that can be submitted. Default is True. :type unlimited_queries: boolean, optional """ - def __init__(self, model: keras.models.Model, output_type: ModelOutputType, black_box_access: Optional[bool] = True, + def __init__(self, model: "keras.models.Model", output_type: ModelOutputType, black_box_access: Optional[bool] = True, unlimited_queries: Optional[bool] = True, **kwargs): super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs) logits = False @@ -107,7 +102,7 @@ class KerasRegressor(KerasModel): queries that can be submitted. Default is True. 
:type unlimited_queries: boolean, optional """ - def __init__(self, model: keras.models.Model, black_box_access: Optional[bool] = True, + def __init__(self, model: "keras.models.Model", black_box_access: Optional[bool] = True, unlimited_queries: Optional[bool] = True, **kwargs): super().__init__(model, ModelOutputType.REGRESSOR_SCALAR, black_box_access, unlimited_queries, **kwargs) self._art_model = ArtKerasRegressor(model) diff --git a/apt/utils/models/pytorch_model.py b/apt/utils/models/pytorch_model.py index 3e8b550..a97fd33 100644 --- a/apt/utils/models/pytorch_model.py +++ b/apt/utils/models/pytorch_model.py @@ -31,7 +31,9 @@ class PyTorchClassifierWrapper(ArtPyTorchClassifier): """ def get_step_correct(self, outputs, targets) -> int: - """get number of correctly classified labels""" + """ + Get number of correctly classified labels. + """ if len(outputs) != len(targets): raise ValueError("outputs and targets should be the same length.") if self.nb_classes > 1: @@ -40,7 +42,9 @@ class PyTorchClassifierWrapper(ArtPyTorchClassifier): return int(torch.sum(torch.round(outputs, axis=-1) == targets).item()) def _eval(self, loader: DataLoader): - """inner function for model evaluation""" + """ + Inner function for model evaluation. + """ self.model.eval() total_loss = 0 @@ -74,19 +78,20 @@ class PyTorchClassifierWrapper(ArtPyTorchClassifier): ) -> None: """ Fit the classifier on the training set `(x, y)`. + :param x: Training data. :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or index labels - of shape (nb_samples,). + of shape (nb_samples,). :param x_validation: Validation data (optional). :param y_validation: Target validation values (class labels) one-hot-encoded of shape - (nb_samples, nb_classes) or index labels of shape (nb_samples,) (optional). + (nb_samples, nb_classes) or index labels of shape (nb_samples,) (optional). :param batch_size: Size of batches. :param nb_epochs: Number of epochs to use for training. 
:param save_checkpoints: Boolean, save checkpoints if True. :param save_entire_model: Boolean, save entire model if True, else save state dict. :param path: path for saving checkpoint. :param kwargs: Dictionary of framework-specific arguments. This parameter is not currently - supported for PyTorch and providing it takes no effect. + supported for PyTorch and providing it takes no effect. """ # Put the model in the training mode self._model.train() @@ -153,7 +158,8 @@ class PyTorchClassifierWrapper(ArtPyTorchClassifier): def save_checkpoint_state_dict(self, is_best: bool, path=os.getcwd(), filename="latest.tar") -> None: """ - Saves checkpoint as latest.tar or best.tar + Saves checkpoint as latest.tar or best.tar. + :param is_best: whether the model is the best achieved model :param path: path for saving checkpoint :param filename: checkpoint name @@ -176,7 +182,8 @@ class PyTorchClassifierWrapper(ArtPyTorchClassifier): def save_checkpoint_model(self, is_best: bool, path=os.getcwd(), filename="latest.tar") -> None: """ - Saves checkpoint as latest.tar or best.tar + Saves checkpoint as latest.tar or best.tar. + :param is_best: whether the model is the best achieved model :param path: path for saving checkpoint :param filename: checkpoint name @@ -194,7 +201,8 @@ class PyTorchClassifierWrapper(ArtPyTorchClassifier): def load_checkpoint_state_dict_by_path(self, model_name: str, path: str = None): """ - Load model only based on the check point path + Load model only based on the check point path. + :param model_name: check point filename :param path: checkpoint path (default current work dir) :return: loaded model @@ -219,21 +227,24 @@ class PyTorchClassifierWrapper(ArtPyTorchClassifier): def load_latest_state_dict_checkpoint(self): """ - Load model state dict only based on the check point path (latest.tar) + Load model state dict only based on the check point path (latest.tar). 
+ :return: loaded model """ self.load_checkpoint_state_dict_by_path("latest.tar") def load_best_state_dict_checkpoint(self): """ - Load model state dict only based on the check point path (model_best.tar) + Load model state dict only based on the check point path (model_best.tar). + :return: loaded model """ self.load_checkpoint_state_dict_by_path("model_best.tar") def load_checkpoint_model_by_path(self, model_name: str, path: str = None): """ - Load model only based on the check point path + Load model only based on the check point path. + :param model_name: check point filename :param path: checkpoint path (default current work dir) :return: loaded model @@ -254,14 +265,16 @@ class PyTorchClassifierWrapper(ArtPyTorchClassifier): def load_latest_model_checkpoint(self): """ - Load entire model only based on the check point path (latest.tar) + Load entire model only based on the check point path (latest.tar). + :return: loaded model """ self.load_checkpoint_model_by_path("latest.tar") def load_best_model_checkpoint(self): """ - Load entire model only based on the check point path (model_best.tar) + Load entire model only based on the check point path (model_best.tar). + :return: loaded model """ self.load_checkpoint_model_by_path("model_best.tar") @@ -288,11 +301,11 @@ class PyTorchClassifier(PyTorchModel): Initialization specifically for the PyTorch-based implementation. :param model: PyTorch model. The output of the model can be logits, probabilities or anything else. Logits - output should be preferred where possible to ensure attack efficiency. + output should be preferred where possible to ensure attack efficiency. :param output_type: The type of output the model yields (vector/label only for classifiers, value for regressors) :param loss: The loss function for which to compute gradients for training. The target label must be raw - categorical, i.e. not converted to one-hot encoding. + categorical, i.e. not converted to one-hot encoding. 
:param input_shape: The shape of one input instance. :param optimizer: The optimizer used to train the classifier. :param black_box_access: Boolean describing the type of deployment of the model (when in production). @@ -311,7 +324,7 @@ class PyTorchClassifier(PyTorchModel): @property def loss(self): """ - The pytorch model's loss function + The pytorch model's loss function. :return: The pytorch model's loss function """ @@ -320,7 +333,7 @@ class PyTorchClassifier(PyTorchModel): @property def optimizer(self): """ - The pytorch model's optimizer + The pytorch model's optimizer. :return: The pytorch model's optimizer """ @@ -350,7 +363,7 @@ class PyTorchClassifier(PyTorchModel): :param save_entire_model: Boolean, save entire model if True, else save state dict. :param path: path for saving checkpoint. :param kwargs: Dictionary of framework-specific arguments. This parameter is not currently - supported for PyTorch and providing it takes no effect. + supported for PyTorch and providing it takes no effect. """ if validation_data is None: self._art_model.fit( @@ -390,6 +403,7 @@ class PyTorchClassifier(PyTorchModel): def score(self, test_data: PytorchData, **kwargs): """ Score the model using test data. + :param test_data: Test data. :type test_data: `PytorchData` :return: the score as float (between 0 and 1) @@ -400,7 +414,8 @@ class PyTorchClassifier(PyTorchModel): def load_checkpoint_state_dict_by_path(self, model_name: str, path: str = None): """ - Load model only based on the check point path + Load model only based on the check point path. + :param model_name: check point filename :param path: checkpoint path (default current work dir) :return: loaded model @@ -409,21 +424,24 @@ class PyTorchClassifier(PyTorchModel): def load_latest_state_dict_checkpoint(self): """ - Load model state dict only based on the check point path (latest.tar) + Load model state dict only based on the check point path (latest.tar). 
+ :return: loaded model """ self._art_model.load_latest_state_dict_checkpoint() def load_best_state_dict_checkpoint(self): """ - Load model state dict only based on the check point path (model_best.tar) + Load model state dict only based on the check point path (model_best.tar). + :return: loaded model """ self._art_model.load_best_state_dict_checkpoint() def load_checkpoint_model_by_path(self, model_name: str, path: str = None): """ - Load model only based on the check point path + Load model only based on the check point path. + :param model_name: check point filename :param path: checkpoint path (default current work dir) :return: loaded model @@ -432,14 +450,16 @@ class PyTorchClassifier(PyTorchModel): def load_latest_model_checkpoint(self): """ - Load entire model only based on the check point path (latest.tar) + Load entire model only based on the check point path (latest.tar). + :return: loaded model """ self._art_model.load_latest_model_checkpoint() def load_best_model_checkpoint(self): """ - Load entire model only based on the check point path (model_best.tar) + Load entire model only based on the check point path (model_best.tar). + :return: loaded model """ self._art_model.load_best_model_checkpoint() diff --git a/apt/utils/models/xgboost_model.py b/apt/utils/models/xgboost_model.py index 2fdc9fe..85f9a89 100644 --- a/apt/utils/models/xgboost_model.py +++ b/apt/utils/models/xgboost_model.py @@ -3,7 +3,6 @@ from typing import Optional, Tuple from apt.utils.models import Model, ModelOutputType, ScoringMethod, check_correct_model_output, is_one_hot from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE -from xgboost import XGBClassifier import numpy as np from art.estimators.classification.xgboost import XGBoostClassifier as ArtXGBoostClassifier @@ -37,7 +36,7 @@ class XGBoostClassifier(XGBoostModel): queries that can be submitted. Default is True. 
:type unlimited_queries: boolean, optional """ - def __init__(self, model: XGBClassifier, output_type: ModelOutputType, input_shape: Tuple[int, ...], + def __init__(self, model: "xgboost.XGBClassifier", output_type: ModelOutputType, input_shape: Tuple[int, ...], nb_classes: int, black_box_access: Optional[bool] = True, unlimited_queries: Optional[bool] = True, **kwargs): super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs) diff --git a/docs/conf.py b/docs/conf.py index aa505c2..d0da43d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,7 +22,7 @@ copyright = '2021, IBM' author = 'Abigail Goldsteen' # The full version, including alpha/beta/rc tags -release = '0.1.0' +release = '0.2.0' master_doc = 'index' @@ -53,7 +53,7 @@ exclude_patterns = [] # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'pyramid' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/docs/index.rst b/docs/index.rst index 6a1969d..0d26e63 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -18,6 +18,8 @@ minimization principle in GDPR for ML models. It enables to reduce the amount of personal data needed to perform predictions with a machine learning model, while still enabling the model to make accurate predictions. This is done by by removing or generalizing some of the input features. +The dataset risk assessment module implements a tool for privacy assessment of synthetic datasets that are to be used in AI model training. + .. 
toctree:: :maxdepth: 2 :caption: Getting Started: diff --git a/docs/source/apt.anonymization.rst b/docs/source/apt.anonymization.rst index 6453554..f0aea69 100644 --- a/docs/source/apt.anonymization.rst +++ b/docs/source/apt.anonymization.rst @@ -12,7 +12,6 @@ apt.anonymization.anonymizer module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/apt.minimization.rst b/docs/source/apt.minimization.rst index 417fc68..a84364e 100644 --- a/docs/source/apt.minimization.rst +++ b/docs/source/apt.minimization.rst @@ -12,7 +12,6 @@ apt.minimization.minimizer module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/apt.risk.data_assessment.rst b/docs/source/apt.risk.data_assessment.rst new file mode 100644 index 0000000..88c345c --- /dev/null +++ b/docs/source/apt.risk.data_assessment.rst @@ -0,0 +1,61 @@ +apt.risk.data\_assessment package +================================= + +Submodules +---------- + +apt.risk.data\_assessment.attack\_strategy\_utils module +-------------------------------------------------------- + +.. automodule:: apt.risk.data_assessment.attack_strategy_utils + :members: + :undoc-members: + :show-inheritance: + +apt.risk.data\_assessment.dataset\_assessment\_manager module +------------------------------------------------------------- + +.. automodule:: apt.risk.data_assessment.dataset_assessment_manager + :members: + :undoc-members: + :show-inheritance: + +apt.risk.data\_assessment.dataset\_attack module +------------------------------------------------ + +.. automodule:: apt.risk.data_assessment.dataset_attack + :members: + :undoc-members: + :show-inheritance: + +apt.risk.data\_assessment.dataset\_attack\_membership\_knn\_probabilities module +-------------------------------------------------------------------------------- + +.. 
automodule:: apt.risk.data_assessment.dataset_attack_membership_knn_probabilities + :members: + :undoc-members: + :show-inheritance: + +apt.risk.data\_assessment.dataset\_attack\_result module +-------------------------------------------------------- + +.. automodule:: apt.risk.data_assessment.dataset_attack_result + :members: + :undoc-members: + :show-inheritance: + +apt.risk.data\_assessment.dataset\_attack\_whole\_dataset\_knn\_distance module +------------------------------------------------------------------------------- + +.. automodule:: apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: apt.risk.data_assessment + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apt.risk.rst b/docs/source/apt.risk.rst new file mode 100644 index 0000000..565b3ed --- /dev/null +++ b/docs/source/apt.risk.rst @@ -0,0 +1,18 @@ +apt.risk package +================ + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + apt.risk.data_assessment + +Module contents +--------------- + +.. 
automodule:: apt.risk + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/apt.rst b/docs/source/apt.rst index ebbf65f..b77eada 100644 --- a/docs/source/apt.rst +++ b/docs/source/apt.rst @@ -9,6 +9,7 @@ Subpackages apt.anonymization apt.minimization + apt.risk apt.utils Module contents diff --git a/docs/source/apt.utils.datasets.rst b/docs/source/apt.utils.datasets.rst index f40bbdf..b35ee0d 100644 --- a/docs/source/apt.utils.datasets.rst +++ b/docs/source/apt.utils.datasets.rst @@ -12,7 +12,6 @@ apt.utils.datasets.datasets module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/docs/source/apt.utils.models.rst b/docs/source/apt.utils.models.rst index de4a5b1..3caa93c 100644 --- a/docs/source/apt.utils.models.rst +++ b/docs/source/apt.utils.models.rst @@ -4,6 +4,14 @@ apt.utils.models package Submodules ---------- +apt.utils.models.keras\_model module +------------------------------------ + +.. automodule:: apt.utils.models.keras_model + :members: + :undoc-members: + :show-inheritance: + apt.utils.models.model module ----------------------------- @@ -12,6 +20,14 @@ apt.utils.models.model module :undoc-members: :show-inheritance: +apt.utils.models.pytorch\_model module +-------------------------------------- + +.. automodule:: apt.utils.models.pytorch_model + :members: + :undoc-members: + :show-inheritance: + apt.utils.models.sklearn\_model module -------------------------------------- @@ -20,6 +36,13 @@ apt.utils.models.sklearn\_model module :undoc-members: :show-inheritance: +apt.utils.models.xgboost\_model module +-------------------------------------- + +.. 
automodule:: apt.utils.models.xgboost_model + :members: + :undoc-members: + :show-inheritance: Module contents --------------- diff --git a/docs/source/apt.utils.rst b/docs/source/apt.utils.rst index 4a6ce11..4ae24d2 100644 --- a/docs/source/apt.utils.rst +++ b/docs/source/apt.utils.rst @@ -21,7 +21,6 @@ apt.utils.dataset\_utils module :undoc-members: :show-inheritance: - Module contents --------------- diff --git a/requirements.txt b/requirements.txt index 2a438f3..fec0e47 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,6 @@ sortedcontainers==2.4.0 notebook jupyter ipywidgets + +#doc +sphinx_rtd_theme diff --git a/setup.cfg b/setup.cfg index 6820c91..77e9de1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,7 @@ [metadata] # replace with your username: name = ai-privacy-toolkit -version = 0.1.0 +version = 0.2.0 author = Abigail Goldsteen author_email = abigailt@il.ibm.com description = A toolkit for tools and techniques related to the privacy and compliance of AI models. 
diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py index f3f7fa7..bd2f422 100644 --- a/tests/test_minimizer.py +++ b/tests/test_minimizer.py @@ -10,6 +10,7 @@ from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder +import tensorflow as tf from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense, Input @@ -19,6 +20,8 @@ from apt.utils.dataset_utils import get_iris_dataset_np, get_adult_dataset_pd, g from apt.utils.datasets import ArrayDataset from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegressor, KerasClassifier +tf.compat.v1.disable_eager_execution() + @pytest.fixture def data(): diff --git a/tests/test_model.py b/tests/test_model.py index 8f7ee0d..b8fb8f1 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -10,14 +10,16 @@ from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestClassifier from xgboost import XGBClassifier +import tensorflow as tf from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense, Input from art.utils import check_and_transform_label_format - from art.utils import to_categorical +tf.compat.v1.disable_eager_execution() + def test_sklearn_classifier(): (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset_np() From 44a5d2e188f1398eed326668e33e2932a5cd4985 Mon Sep 17 00:00:00 2001 From: abigailgold <57357634+abigailgold@users.noreply.github.com> Date: Tue, 9 May 2023 13:30:28 +0300 Subject: [PATCH 05/11] Fix rtd docs (#75) * Fix issue building docs with new urllib3 * Try to fix rtd build Signed-off-by: abigailt --- .readthedocs.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 53a12db..d03da5e 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -12,7 +12,11 @@ sphinx: # Optionally build your docs in additional formats such as 
PDF # Optionally set the version of Python and requirements required to build your docs +build: + os: ubuntu-22.04 + tools: + python: "3.8" + python: - version: 3.8 install: - requirements: requirements.txt \ No newline at end of file From 710aae4083049a60e01a6a0357cde5b75ce451d0 Mon Sep 17 00:00:00 2001 From: abigailt Date: Tue, 16 May 2023 13:03:01 +0300 Subject: [PATCH 06/11] Initial commit. Tests not yet passing. Signed-off-by: abigailt --- apt/minimization/minimizer.py | 220 ++++++++++++++++++++++------------ tests/test_minimizer.py | 102 ++++++++++++++-- 2 files changed, 233 insertions(+), 89 deletions(-) diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py index 65513b8..702b799 100644 --- a/apt/minimization/minimizer.py +++ b/apt/minimization/minimizer.py @@ -41,6 +41,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM :param target_accuracy: The required relative accuracy when applying the base model to the generalized data. Accuracy is measured relative to the original accuracy of the model. :type target_accuracy: float, optional + :param generalize_using_transform: Indicates how to calculate NCP and accuracy during the generalization process. + True means that the `transform` method is used to transform original data into + generalized data that is used for accuracy and NCP calculation. False indicates + that the `generalizations` structure should be used. Default is True. + :type generalize_using_transform: boolean, optional :param cells: The cells used to generalize records. Each cell must define a range or subset of categories for each feature, as well as a representative value for each feature. This parameter should be used when instantiating a transformer object without first fitting it. 
@@ -61,8 +66,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM :type is_regression: boolean, optional """ - def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: Optional[float] = 0.998, - cells: Optional[list] = None, categorical_features: Optional[Union[np.ndarray, list]] = None, + def __init__(self, estimator: Union[BaseEstimator, Model] = None, + target_accuracy: Optional[float] = 0.998, + generalize_using_transform: Optional[bool] = True, + cells: Optional[list] = None, + categorical_features: Optional[Union[np.ndarray, list]] = None, encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None, features_to_minimize: Optional[Union[np.ndarray, list]] = None, train_only_features_to_minimize: Optional[bool] = True, @@ -76,6 +84,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_PROBABILITIES) self.target_accuracy = target_accuracy self.cells = cells + if cells: + self._calculate_generalizations() self.categorical_features = [] if categorical_features: self.categorical_features = categorical_features @@ -83,6 +93,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self.train_only_features_to_minimize = train_only_features_to_minimize self.is_regression = is_regression self.encoder = encoder + # self.generalize_using_transform = generalize_using_transform + self.generalize_using_transform = False + self._ncp = 0.0 + self._feature_data = {} + self._categorical_values = {} + self._dt = None + self._features = None + self._level = 0 def get_params(self, deep=True): """ @@ -99,6 +117,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM ret['features_to_minimize'] = self.features_to_minimize ret['train_only_features_to_minimize'] = self.train_only_features_to_minimize ret['is_regression'] = self.is_regression + 
ret['generalize_using_transform'] = self.generalize_using_transform if deep: ret['cells'] = copy.deepcopy(self.cells) ret['estimator'] = self.estimator @@ -132,6 +151,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self.is_regression = params['is_regression'] if 'cells' in params: self.cells = params['cells'] + if 'generalize_using_transform' in params: + self.generalize_using_transform = params['generalize_using_transform'] return self @property @@ -140,17 +161,18 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM Return the generalizations derived from the model and test data. :return: generalizations object. Contains 3 sections: 'ranges' that contains ranges for numerical features, - 'categories' that contains sub-groups of categories for categorical features, and - 'untouched' that contains the features that could not be generalized. + 'categories' that contains sub-groups of categories for categorical features, and + 'untouched' that contains the features that could not be generalized. """ return self._generalizations @property def ncp(self): """ - Return the NCP score of the generalizations. + Return the last calculated NCP score. NCP score is calculated upon calling `fit` (on the training data), + `transform` (on the test data) or when explicitly calling `calculate_ncp` and providing it a dataset. - :return: ncp score as float. + :return: NCP score as float. 
""" return self._ncp @@ -251,9 +273,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM used_X_test = X_test_QI # collect feature data (such as min, max) - feature_data = {} + self._feature_data = {} for feature in self._features: - if feature not in feature_data.keys(): + if feature not in self._feature_data.keys(): fd = {} values = list(x.loc[:, feature]) if feature not in self.categorical_features: @@ -262,7 +284,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM fd['range'] = max(values) - min(values) else: fd['range'] = len(np.unique(values)) - feature_data[feature] = fd + self._feature_data[feature] = fd # default encoder in case none provided if self.encoder is None: @@ -316,17 +338,18 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM # if accuracy above threshold, improve generalization if accuracy > self.target_accuracy: print('Improving generalizations') - level = 1 + self._level = 1 while accuracy > self.target_accuracy: cells_previous_iter = self.cells generalization_prev_iter = self._generalizations cells_by_id_prev = self._cells_by_id - nodes = self._get_nodes_level(level) + nodes = self._get_nodes_level(self._level) try: - self._calculate_level_cells(level) + self._calculate_level_cells(self._level) except TypeError as e: print(e) + self._level -= 1 break self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes) @@ -340,10 +363,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self.cells = cells_previous_iter self._generalizations = generalization_prev_iter self._cells_by_id = cells_by_id_prev + self._level -= 1 break else: - print('Pruned tree to level: %d, new relative accuracy: %f' % (level, accuracy)) - level += 1 + print('Pruned tree to level: %d, new relative accuracy: %f' % (self._level, accuracy)) + self._level += 1 # if accuracy below threshold, improve accuracy by removing features 
from generalization elif accuracy < self.target_accuracy: @@ -351,7 +375,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM while accuracy < self.target_accuracy: removed_feature = self._remove_feature_from_generalization(X_test, x_prepared_test, nodes, y_test, - feature_data, accuracy) + self._feature_data, accuracy) if removed_feature is None: break @@ -363,7 +387,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM # self._cells currently holds the chosen generalization based on target accuracy # calculate iLoss - self._ncp = self._calculate_ncp(X_test, self._generalizations, feature_data) + self.calculate_ncp(X_test) # Return the transformer return self @@ -383,7 +407,66 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM :return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features) """ + transformed = self._inner_transform(X, features_names, dataset) + if not self.generalize_using_transform: + raise ValueError('transform method called even though generalize_using_transform parameter was False. This ' + 'can lead to inconsistent results.') + self.calculate_ncp(transformed, True) + return transformed + def calculate_ncp(self, samples: Optional[DATA_PANDAS_NUMPY_TYPE] = None, transformed: Optional[bool] = False): + """ + Compute the NCP score of the generalization. Calculation is based on the value of the + generalize_using_transform param. + Based on the NCP score presented in: Ghinita, G., Karras, P., Kalnis, P., Mamoulis, N.: Fast data anonymization + with low information loss (https://www.vldb.org/conf/2007/papers/research/p758-ghinita.pdf) + + :param samples: The input samples to compute the NCP score on. Ideally should be the data that will be + transformed (e.g., test/runtime data). 
If not samples supplied, will return the last NCP score + computed by the `fit` or `transform` method. + :type samples: {array-like, sparse matrix}, shape (n_samples, n_features), optional + :param transformed: Whether the supplied samples have already been transformed using the `transform` method. + Default is False. + :type transformed: boolean, optional + :return: NCP score as float. + """ + if samples is None: + return self._ncp + elif self.generalize_using_transform: + if not transformed: + # transform data + transformed_data = self._inner_transform(samples) + else: + transformed_data = samples + #TODO + else: # use generalizations + # suppressed features are already taken care of within _calc_ncp_numeric + ranges = self.generalizations['ranges'] + categories = self.generalizations['categories'] + range_counts = self._find_range_count(samples, ranges) + category_counts = self._find_categories_count(samples, categories) + + total = samples.shape[0] + total_ncp = 0 + total_features = len(self.generalizations['untouched']) + for feature in ranges.keys(): + feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature], + self._feature_data[feature], total) + total_ncp = total_ncp + feature_ncp + total_features += 1 + for feature in categories.keys(): + feature_ncp = self._calc_ncp_categorical(categories[feature], category_counts[feature], + self._feature_data[feature], + total) + total_ncp = total_ncp + feature_ncp + total_features += 1 + if total_features == 0: + return 0 + self._ncp = total_ncp / total_features + return self._ncp + + def _inner_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None, + dataset: Optional[ArrayDataset] = None): # Check if fit has been called msg = 'This %(name)s instance is not initialized yet. 
' \ 'Call ‘fit’ or ‘set_params’ with ' \ @@ -409,12 +492,21 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM if not self._features: self._features = [i for i in range(x.shape[1])] - mapped = np.zeros(x.shape[0]) # to mark records we already mapped - all_indexes = [] - for i in range(len(self.cells)): - indexes = self._get_record_indexes_for_cell(x, self.cells[i], mapped) - all_indexes.append(indexes) - generalized = self._generalize_indexes(x, self.cells, all_indexes) + if self._dt: # only works if fit was called previously (but much more efficient) + nodes = self._get_nodes_level(self._level) + QI = x.loc[:, self.features_to_minimize] + used_x = x + if self.train_only_features_to_minimize: + used_x = QI + prepared = self._encode_categorical_features(used_x) + generalized = self._generalize(x, prepared, nodes, self.cells, self._cells_by_id) + else: + mapped = np.zeros(x.shape[0]) # to mark records we already mapped + all_indexes = [] + for i in range(len(self.cells)): + indexes = self._get_record_indexes_for_cell(x, self.cells[i], mapped) + all_indexes.append(indexes) + generalized = self._generalize_indexes(x, self.cells, all_indexes) if dataset and dataset.is_pandas: return generalized @@ -422,6 +514,26 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM return generalized return generalized.to_numpy() + @staticmethod + def _calc_ncp_categorical(categories, category_count, feature_data, total): + category_sizes = [len(g) if len(g) > 1 else 0 for g in categories] + normalized_category_sizes = [s * n / total for s, n in zip(category_sizes, category_count)] + average_group_size = sum(normalized_category_sizes) / len(normalized_category_sizes) + return average_group_size / feature_data['range'] # number of values in category + + @staticmethod + def _calc_ncp_numeric(feature_range, range_count, feature_data, total): + # if there are no ranges, feature is supressed and iLoss is 1 + if not feature_range: 
+ return 1 + # range only contains the split values, need to add min and max value of feature + # to enable computing sizes of all ranges + new_range = [feature_data['min']] + feature_range + [feature_data['max']] + range_sizes = [b - a for a, b in zip(new_range[::1], new_range[1::1])] + normalized_range_sizes = [s * n / total for s, n in zip(range_sizes, range_count)] + average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes) + return average_range_size / (feature_data['max'] - feature_data['min']) + def _get_record_indexes_for_cell(self, X, cell, mapped): indexes = [] for index, row in X.iterrows(): @@ -429,20 +541,21 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM indexes.append(index) return indexes - def _cell_contains(self, cell, x, i, mapped): + def _cell_contains(self, cell, x, index, mapped): for f in self._features: + i = self._features.index(f) if f in cell['ranges']: - if not self._cell_contains_numeric(f, cell['ranges'][f], x): + if not self._cell_contains_numeric(i, cell['ranges'][f], x): return False elif f in cell['categories']: - if not self._cell_contains_categorical(f, cell['categories'][f], x): + if not self._cell_contains_categorical(i, cell['categories'][f], x): return False elif f in cell['untouched']: continue else: raise TypeError("feature " + f + "not found in cell" + cell['id']) # Mark as mapped - mapped.itemset(i, 1) + mapped.itemset(index, 1) return True def _encode_categorical_features(self, X, save_mapping=False): @@ -476,8 +589,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self._encoded_features = new_data.columns return new_data - def _cell_contains_numeric(self, f, range, x): - i = self._features.index(f) + @staticmethod + def _cell_contains_numeric(i, range, x): # convert x to ndarray to allow indexing a = np.array(x) value = a.item(i) @@ -489,8 +602,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM 
return False return True - def _cell_contains_categorical(self, f, range, x): - i = self._features.index(f) + @staticmethod + def _cell_contains_categorical(i, range, x): # convert x to ndarray to allow indexing a = np.array(x) value = a.item(i) @@ -819,7 +932,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self._remove_categorical_untouched(self._generalizations) def _find_range_count(self, samples, ranges): - samples_df = pd.DataFrame(samples, columns=self._encoded_features) + samples_df = pd.DataFrame(samples, columns=self._features) range_counts = {} last_value = None for r in ranges.keys(): @@ -844,31 +957,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM category_counts[c].append(len(samples.loc[samples[c].isin(value)])) return category_counts - def _calculate_ncp(self, samples, generalizations, feature_data): - # supressed features are already taken care of within _calc_ncp_numeric - ranges = generalizations['ranges'] - categories = generalizations['categories'] - range_counts = self._find_range_count(samples, ranges) - category_counts = self._find_categories_count(samples, categories) - - total = samples.shape[0] - total_ncp = 0 - total_features = len(generalizations['untouched']) - for feature in ranges.keys(): - feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature], - feature_data[feature], total) - total_ncp = total_ncp + feature_ncp - total_features += 1 - for feature in categories.keys(): - featureNCP = self._calc_ncp_categorical(categories[feature], category_counts[feature], - feature_data[feature], - total) - total_ncp = total_ncp + featureNCP - total_features += 1 - if total_features == 0: - return 0 - return total_ncp / total_features - @staticmethod def _calculate_ranges(cells): ranges = {} @@ -942,26 +1030,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM untouched = untouched.intersection(*untouched_lists) return 
list(untouched) - @staticmethod - def _calc_ncp_categorical(categories, categoryCount, feature_data, total): - category_sizes = [len(g) if len(g) > 1 else 0 for g in categories] - normalized_category_sizes = [s * n / total for s, n in zip(category_sizes, categoryCount)] - average_group_size = sum(normalized_category_sizes) / len(normalized_category_sizes) - return average_group_size / feature_data['range'] # number of values in category - - @staticmethod - def _calc_ncp_numeric(feature_range, range_count, feature_data, total): - # if there are no ranges, feature is supressed and iLoss is 1 - if not feature_range: - return 1 - # range only contains the split values, need to add min and max value of feature - # to enable computing sizes of all ranges - new_range = [feature_data['min']] + feature_range + [feature_data['max']] - range_sizes = [b - a for a, b in zip(new_range[::1], new_range[1::1])] - normalized_range_sizes = [s * n / total for s, n in zip(range_sizes, range_count)] - average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes) - return average_range_size / (feature_data['max'] - feature_data['min']) - @staticmethod def _remove_feature_from_cells(cells, cells_by_id, feature): for cell in cells: diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py index bd2f422..6cc5197 100644 --- a/tests/test_minimizer.py +++ b/tests/test_minimizer.py @@ -54,6 +54,33 @@ def test_minimizer_params(data): gen.transform(dataset=ArrayDataset(X, features_names=features)) +def test_minimizer_params_not_transform(data): + # Assume two features, age and height, and boolean label + cells = [{"id": 1, "ranges": {"age": {"start": None, "end": 38}, "height": {"start": None, "end": 170}}, "label": 0, + 'categories': {}, "representative": {"age": 26, "height": 149}}, + {"id": 2, "ranges": {"age": {"start": 39, "end": None}, "height": {"start": None, "end": 170}}, "label": 1, + 'categories': {}, "representative": {"age": 58, "height": 163}}, + {"id": 3, 
"ranges": {"age": {"start": None, "end": 38}, "height": {"start": 171, "end": None}}, "label": 0, + 'categories': {}, "representative": {"age": 31, "height": 184}}, + {"id": 4, "ranges": {"age": {"start": 39, "end": None}, "height": {"start": 171, "end": None}}, "label": 1, + 'categories': {}, "representative": {"age": 45, "height": 176}} + ] + features = ['age', 'height'] + X = np.array([[23, 165], + [45, 158], + [18, 190]]) + y = [1, 1, 0] + base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, + min_samples_leaf=1) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES) + model.fit(ArrayDataset(X, y)) + + gen = GeneralizeToRepresentative(model, cells=cells) + gen.calculate_ncp(X) + ncp = gen.ncp + assert (ncp > 0.0) + + def test_minimizer_fit(data): features = ['age', 'height'] X = np.array([[23, 165], @@ -101,13 +128,62 @@ def test_minimizer_fit(data): assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all()) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[indexes]) != (X[indexes])).any()) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) +def test_minimizer_fit_not_transform(data): + features = ['age', 'height'] + X = np.array([[23, 165], + [45, 158], + [56, 123], + [67, 154], + [45, 149], + [42, 166], + [73, 172], + [94, 168], + [69, 175], + [24, 181], + [18, 190]]) + y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]) + base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, + min_samples_leaf=1) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES) + model.fit(ArrayDataset(X, y)) + ad = ArrayDataset(X) + predictions = model.predict(ad) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, 
axis=1) + target_accuracy = 0.5 + gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy) + train_dataset = ArrayDataset(X, predictions, features_names=features) + + gen.fit(dataset=train_dataset) + gener = gen.generalizations + expected_generalizations = {'ranges': {}, 'categories': {}, 'untouched': ['height', 'age']} + + for key in expected_generalizations['ranges']: + assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key])) + for key in expected_generalizations['categories']: + assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) + == set([frozenset(sl) for sl in gener['categories'][key]])) + assert (set(expected_generalizations['untouched']) == set(gener['untouched'])) + modified_features = [f for f in features if + f in expected_generalizations['categories'].keys() or f in expected_generalizations[ + 'ranges'].keys()] + indexes = [] + for i in range(len(features)): + if features[i] in modified_features: + indexes.append(i) + + ncp = gen.ncp + if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: + assert (ncp > 0.0) + + def test_minimizer_fit_pandas(data): features = ['age', 'height', 'sex', 'ola'] X = [[23, 165, 'f', 'aa'], @@ -172,7 +248,7 @@ def test_minimizer_fit_pandas(data): np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1)) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[modified_features]).equals(X[modified_features])) is False) rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions)) @@ -294,7 +370,7 @@ def test_minimizer_fit_QI(data): assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all()) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 
or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[indexes]) != (X[indexes])).any()) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) @@ -370,7 +446,7 @@ def test_minimizer_fit_pandas_QI(data): np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1)) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[modified_features]).equals(X[modified_features])) is False) rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions)) @@ -414,7 +490,7 @@ def test_minimize_ndarray_iris(): assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all()) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[indexes]) != (x_train[indexes])).any()) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) @@ -492,7 +568,7 @@ def test_minimize_pandas_adult(): np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1)) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[modified_features]).equals(x_train[modified_features])) is False) rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions)) @@ -570,7 +646,7 @@ def test_german_credit_pandas(): np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1)) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert 
(ncp > 0) + assert (ncp > 0.0) assert (((transformed[modified_features]).equals(x_train[modified_features])) is False) rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions)) @@ -644,7 +720,7 @@ def test_regression(): assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all()) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[indexes]) != (x_train[indexes])).any()) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) @@ -698,7 +774,7 @@ def test_X_y(data): assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all()) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[indexes]) != (X[indexes])).any()) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) @@ -752,7 +828,7 @@ def test_X_y_features_names(data): assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all()) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[indexes]) != (X[indexes])).any()) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) @@ -826,7 +902,7 @@ def test_BaseEstimator_classification(data): np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1)) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[modified_features]).equals(X[modified_features])) is False) rel_accuracy = model.score(preprocessor.transform(transformed), 
predictions) @@ -899,7 +975,7 @@ def test_BaseEstimator_regression(): assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all()) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[indexes]) != (x_train[indexes])).any()) rel_accuracy = model.score(transformed, predictions) @@ -940,7 +1016,7 @@ def test_keras_model(): assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_test, indexes, axis=1)).all()) ncp = gen.ncp if len(gener['ranges'].keys()) > 0 or len(gener['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[indexes]) != (X[indexes])).any()) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) From 4541ee60a23f83e601d00d20eb79420fbbaa152b Mon Sep 17 00:00:00 2001 From: abigailt Date: Thu, 18 May 2023 10:32:54 +0300 Subject: [PATCH 07/11] generalize_using_transform=False supported Signed-off-by: abigailt --- apt/minimization/minimizer.py | 3 +-- tests/test_minimizer.py | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py index 702b799..1e15d3d 100644 --- a/apt/minimization/minimizer.py +++ b/apt/minimization/minimizer.py @@ -93,8 +93,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self.train_only_features_to_minimize = train_only_features_to_minimize self.is_regression = is_regression self.encoder = encoder - # self.generalize_using_transform = generalize_using_transform - self.generalize_using_transform = False + self.generalize_using_transform = generalize_using_transform self._ncp = 0.0 self._feature_data = {} self._categorical_values = {} diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py index 6cc5197..8e5a6cc 100644 --- a/tests/test_minimizer.py +++ 
b/tests/test_minimizer.py @@ -75,7 +75,7 @@ def test_minimizer_params_not_transform(data): model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES) model.fit(ArrayDataset(X, y)) - gen = GeneralizeToRepresentative(model, cells=cells) + gen = GeneralizeToRepresentative(model, cells=cells, generalize_using_transform=False) gen.calculate_ncp(X) ncp = gen.ncp assert (ncp > 0.0) @@ -158,7 +158,7 @@ def test_minimizer_fit_not_transform(data): if predictions.shape[1] > 1: predictions = np.argmax(predictions, axis=1) target_accuracy = 0.5 - gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy) + gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, generalize_using_transform=False) train_dataset = ArrayDataset(X, predictions, features_names=features) gen.fit(dataset=train_dataset) @@ -1043,3 +1043,32 @@ def test_untouched(): assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) == set([frozenset(sl) for sl in gener['categories'][key]])) assert (set(expected_generalizations['untouched']) == set(gener['untouched'])) + + +def test_errors(): + features = ['age', 'height'] + X = np.array([[23, 165], + [45, 158], + [56, 123], + [67, 154], + [45, 149], + [42, 166], + [73, 172], + [94, 168], + [69, 175], + [24, 181], + [18, 190]]) + y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]) + base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, + min_samples_leaf=1) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES) + model.fit(ArrayDataset(X, y)) + ad = ArrayDataset(X) + predictions = model.predict(ad) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, axis=1) + gen = GeneralizeToRepresentative(model, generalize_using_transform=False) + train_dataset = ArrayDataset(X, predictions, features_names=features) + gen.fit(dataset=train_dataset) + with pytest.raises(ValueError): + gen.transform(X) From cc4cba0d8e2bd190772ded0be086fe38318d9b07 Mon 
Sep 17 00:00:00 2001 From: abigailt Date: Mon, 29 May 2023 19:13:35 +0300 Subject: [PATCH 08/11] Many fixes, some tests pass Signed-off-by: abigailt --- apt/minimization/minimizer.py | 227 +++++++++++++++++++++++++--------- tests/test_minimizer.py | 31 ++++- 2 files changed, 200 insertions(+), 58 deletions(-) diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py index 1e15d3d..f5b1219 100644 --- a/apt/minimization/minimizer.py +++ b/apt/minimization/minimizer.py @@ -95,7 +95,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self.encoder = encoder self.generalize_using_transform = generalize_using_transform self._ncp = 0.0 - self._feature_data = {} + self._feature_data = None self._categorical_values = {} self._dt = None self._features = None @@ -204,8 +204,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM :type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional :param y: The target values. This should contain the predictions of the original model on ``X``. :type y: array-like, shape (n_samples,), optional - :param features_names: The feature names, in the order that they appear in the data. Can be provided when - passing the data as ``X`` and ``y`` + :param features_names: The feature names, in the order that they appear in the data. Should be provided when + passing the data as ``X`` as a numpy array :type features_names: list of strings, optional :param dataset: Data wrapper containing the training input samples and the predictions of the original model on the training data. Either ``X``, ``y`` OR ``dataset`` need to be provided, not both. 
@@ -272,18 +272,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM used_X_test = X_test_QI # collect feature data (such as min, max) - self._feature_data = {} - for feature in self._features: - if feature not in self._feature_data.keys(): - fd = {} - values = list(x.loc[:, feature]) - if feature not in self.categorical_features: - fd['min'] = min(values) - fd['max'] = max(values) - fd['range'] = max(values) - min(values) - else: - fd['range'] = len(np.unique(values)) - self._feature_data[feature] = fd + self._feature_data = self._get_feature_data(x) # default encoder in case none provided if self.encoder is None: @@ -386,7 +375,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM # self._cells currently holds the chosen generalization based on target accuracy # calculate iLoss - self.calculate_ncp(X_test) + X_test_dataset = ArrayDataset(X_test, features_names=self._features) + self.calculate_ncp(X_test_dataset) # Return the transformer return self @@ -397,8 +387,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM :param X: The training input samples. :type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional - :param features_names: The feature names, in the order that they appear in the data. Can be provided when - passing the data as ``X`` and ``y`` + :param features_names: The feature names, in the order that they appear in the data. Should be provided when + passing the data as ``X`` as a numpy array :type features_names: list of strings, optional :param dataset: Data wrapper containing the training input samples and the predictions of the original model on the training data. Either ``X`` OR ``dataset`` need to be provided, not both. 
@@ -410,10 +400,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM if not self.generalize_using_transform: raise ValueError('transform method called even though generalize_using_transform parameter was False. This ' 'can lead to inconsistent results.') - self.calculate_ncp(transformed, True) + transformed_dataset = ArrayDataset(transformed, features_names=self._features) + self.calculate_ncp(transformed_dataset, True) return transformed - def calculate_ncp(self, samples: Optional[DATA_PANDAS_NUMPY_TYPE] = None, transformed: Optional[bool] = False): + def calculate_ncp(self, samples: Optional[ArrayDataset] = None, transformed: Optional[bool] = False): """ Compute the NCP score of the generalization. Calculation is based on the value of the generalize_using_transform param. @@ -423,7 +414,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM :param samples: The input samples to compute the NCP score on. Ideally should be the data that will be transformed (e.g., test/runtime data). If not samples supplied, will return the last NCP score computed by the `fit` or `transform` method. - :type samples: {array-like, sparse matrix}, shape (n_samples, n_features), optional + :type samples: ArrayDataset, optional. feature_names should be set. :param transformed: Whether the supplied samples have already been transformed using the `transform` method. Default is False. 
:type transformed: boolean, optional @@ -431,37 +422,50 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM """ if samples is None: return self._ncp - elif self.generalize_using_transform: + + samples_pd = pd.DataFrame(samples.get_samples(), columns=samples.features_names) + if self._features is None: + self._features = samples.features_names + if self._feature_data is None: + self._feature_data = self._get_feature_data(samples_pd) + + if self.generalize_using_transform: if not transformed: # transform data - transformed_data = self._inner_transform(samples) + transformed_data = self._inner_transform(dataset=samples) # can return numpy or pandas + if not samples.is_pandas: + transformed_data = pd.DataFrame(transformed_data, columns=samples.features_names) else: - transformed_data = samples - #TODO + transformed_data = samples_pd + range_counts, category_counts = self._calculate_transformed_generalizations(transformed_data) + generalizations = self._transformed_generalizations else: # use generalizations - # suppressed features are already taken care of within _calc_ncp_numeric - ranges = self.generalizations['ranges'] - categories = self.generalizations['categories'] - range_counts = self._find_range_count(samples, ranges) - category_counts = self._find_categories_count(samples, categories) + generalizations = self.generalizations + range_counts = self._find_range_counts(samples_pd, generalizations['ranges']) + category_counts = self._find_categories_counts(samples_pd, generalizations['categories']) - total = samples.shape[0] - total_ncp = 0 - total_features = len(self.generalizations['untouched']) - for feature in ranges.keys(): - feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature], - self._feature_data[feature], total) - total_ncp = total_ncp + feature_ncp - total_features += 1 - for feature in categories.keys(): - feature_ncp = self._calc_ncp_categorical(categories[feature], category_counts[feature], - 
self._feature_data[feature], - total) - total_ncp = total_ncp + feature_ncp - total_features += 1 - if total_features == 0: - return 0 - self._ncp = total_ncp / total_features + # suppressed features are already taken care of within _calc_ncp_numeric + #TODO: check that this is the case for tramsformed as well + ranges = generalizations['ranges'] + categories = generalizations['categories'] + + total = samples_pd.shape[0] + total_ncp = 0 + total_features = len(generalizations['untouched']) + for feature in ranges.keys(): + feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature], + self._feature_data[feature], total) + total_ncp = total_ncp + feature_ncp + total_features += 1 + for feature in categories.keys(): + feature_ncp = self._calc_ncp_categorical(categories[feature], category_counts[feature], + self._feature_data[feature], + total) + total_ncp = total_ncp + feature_ncp + total_features += 1 + if total_features == 0: + return 0 + self._ncp = total_ncp / total_features return self._ncp def _inner_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None, @@ -480,7 +484,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM elif dataset is None: raise ValueError('Either X OR dataset need to be provided, not both') if dataset and dataset.features_names: - self._features = dataset.features_names + if self._features is None: + self._features = dataset.features_names if dataset and dataset.get_samples() is not None: x = pd.DataFrame(dataset.get_samples(), columns=self._features) @@ -522,7 +527,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM @staticmethod def _calc_ncp_numeric(feature_range, range_count, feature_data, total): - # if there are no ranges, feature is supressed and iLoss is 1 + # if there are no ranges, feature is suppressed and iLoss is 1 if not feature_range: return 1 # range only contains the split values, need to add min 
and max value of feature @@ -533,6 +538,22 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes) return average_range_size / (feature_data['max'] - feature_data['min']) + + def _get_feature_data(self, x): + feature_data = {} + for feature in self._features: + if feature not in feature_data.keys(): + fd = {} + values = list(x.loc[:, feature]) + if feature not in self.categorical_features: + fd['min'] = min(values) + fd['max'] = max(values) + fd['range'] = max(values) - min(values) + else: + fd['range'] = len(np.unique(values)) + feature_data[feature] = fd + return feature_data + def _get_record_indexes_for_cell(self, X, cell, mapped): indexes = [] for index, row in X.iterrows(): @@ -868,12 +889,12 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM # We want to remove features with low iLoss (NCP) and high accuracy gain # (after removing them) ranges = self._generalizations['ranges'] - range_counts = self._find_range_count(original_data, ranges) + range_counts = self._find_range_counts(original_data, ranges) total = prepared_data.size range_min = sys.float_info.max remove_feature = None categories = self.generalizations['categories'] - category_counts = self._find_categories_count(original_data, categories) + category_counts = self._find_categories_counts(original_data, categories) for feature in ranges.keys(): if feature not in self._generalizations['untouched']: @@ -930,25 +951,109 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM 'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells)} self._remove_categorical_untouched(self._generalizations) - def _find_range_count(self, samples, ranges): - samples_df = pd.DataFrame(samples, columns=self._features) + def _calculate_transformed_generalizations(self, transformed): + # transformed data should only consist of representative 
values from cells (when removing untouched features) + ranges = {} + categories = {} + range_counts = {} + category_counts = {} + + unique_records = transformed.value_counts().reset_index(name='count') + representatives = unique_records.drop('count', axis=1) + representative_counts = unique_records['count'] # needed to normalize ncp according to quantity + index = 0 + for _, record in representatives.iterrows(): + # TODO: what if some cells are not present, we will not take their generalizations into account. We need to + # "gain" ncp in this case... + record_dict = self.pandas_record_to_dict(record) + for cell in self.cells: + representative = cell["representative"].copy() + record_copy = record_dict.copy() + if 'untouched' in cell: + for feature in cell['untouched']: + record_copy.pop(feature) + representative.pop(feature) + if record_copy == representative: + # handle numerical features + for feature in [key for key in cell['ranges'].keys() if + 'untouched' not in cell or key not in cell['untouched']]: + if feature not in ranges.keys(): + ranges[feature] = [] + if cell['ranges'][feature]['start'] is not None: + ranges[feature].append(cell['ranges'][feature]['start']) + if cell['ranges'][feature]['end'] is not None: + ranges[feature].append(cell['ranges'][feature]['end']) + if feature in range_counts: + range_counts[feature].append(representative_counts[index]) + else: + range_counts[feature] = [representative_counts[index]] + # handle categorical features + categorical_features_values = {} + for feature in [key for key in cell['categories'].keys() if + 'untouched' not in cell or key not in cell['untouched']]: + if feature not in categorical_features_values.keys(): + categorical_features_values[feature] = [] + for value in cell['categories'][feature]: + if value not in categorical_features_values[feature]: + categorical_features_values[feature].append(value) + for feature in categorical_features_values.keys(): + partitions = [] + values = 
categorical_features_values[feature] + assigned = [] + for i in range(len(values)): + value1 = values[i] + if value1 in assigned: + continue + partition = [value1] + assigned.append(value1) + for j in range(len(values)): + if j <= i: + continue + value2 = values[j] + if GeneralizeToRepresentative._are_inseparable(self.cells, feature, value1, value2): + partition.append(value2) + assigned.append(value2) + partitions.append(partition) + if feature in categories: + categories[feature].append(partitions) + else: + categories[feature] = [partitions] + if feature in category_counts: + category_counts[feature].append(representative_counts[index]) + else: + category_counts[feature] = [representative_counts[index]] + break + index += 1 + + for feature in ranges.keys(): + ranges[feature] = list(set(ranges[feature])) + ranges[feature].sort() + + self._transformed_generalizations = { + 'ranges': ranges, + 'categories': categories, + 'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells)} + self._remove_categorical_untouched(self._transformed_generalizations) + return range_counts, category_counts + + def _find_range_counts(self, samples, ranges): range_counts = {} last_value = None for r in ranges.keys(): range_counts[r] = [] # if empty list, all samples should be counted if not ranges[r]: - range_counts[r].append(samples_df.shape[0]) + range_counts[r].append(samples.shape[0]) else: for value in ranges[r]: - counter = [item for item in samples_df[r] if int(item) <= value] + counter = [item for item in samples[r] if int(item) <= value] range_counts[r].append(len(counter)) last_value = value - counter = [item for item in samples_df[r] if int(item) <= last_value] + counter = [item for item in samples[r] if int(item) <= last_value] range_counts[r].append(len(counter)) return range_counts - def _find_categories_count(self, samples, categories): + def _find_categories_counts(self, samples, categories): category_counts = {} for c in categories.keys(): 
category_counts[c] = [] @@ -1054,3 +1159,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM for feature in to_remove: del generalizations['categories'][feature] + + + @staticmethod + def pandas_record_to_dict(record): + dict = {} + for feature in record.index: + dict[feature] = record[feature] + return dict \ No newline at end of file diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py index 8e5a6cc..3b2f543 100644 --- a/tests/test_minimizer.py +++ b/tests/test_minimizer.py @@ -70,13 +70,42 @@ def test_minimizer_params_not_transform(data): [45, 158], [18, 190]]) y = [1, 1, 0] + samples = ArrayDataset(X, y, features) base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, min_samples_leaf=1) model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES) model.fit(ArrayDataset(X, y)) gen = GeneralizeToRepresentative(model, cells=cells, generalize_using_transform=False) - gen.calculate_ncp(X) + gen.calculate_ncp(samples) + ncp = gen.ncp + assert (ncp > 0.0) + + +def test_minimizer_params_not_transform_no_data(data): + # Assume two features, age and height, and boolean label + cells = [{"id": 1, "ranges": {"age": {"start": None, "end": 38}, "height": {"start": None, "end": 170}}, "label": 0, + 'categories': {}, "representative": {"age": 26, "height": 149}}, + {"id": 2, "ranges": {"age": {"start": 39, "end": None}, "height": {"start": None, "end": 170}}, "label": 1, + 'categories': {}, "representative": {"age": 58, "height": 163}}, + {"id": 3, "ranges": {"age": {"start": None, "end": 38}, "height": {"start": 171, "end": None}}, "label": 0, + 'categories': {}, "representative": {"age": 31, "height": 184}}, + {"id": 4, "ranges": {"age": {"start": 39, "end": None}, "height": {"start": 171, "end": None}}, "label": 1, + 'categories': {}, "representative": {"age": 45, "height": 176}} + ] + features = ['age', 'height'] + X = np.array([[23, 165], + [45, 158], + [18, 190]]) + y = [1, 1, 0] + samples = 
ArrayDataset(X, y, features) + base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, + min_samples_leaf=1) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES) + model.fit(ArrayDataset(X, y)) + + gen = GeneralizeToRepresentative(model, cells=cells, generalize_using_transform=False) + gen.calculate_ncp(samples) ncp = gen.ncp assert (ncp > 0.0) From 26adcf3528aecd955b250abb0d828612fcd12b05 Mon Sep 17 00:00:00 2001 From: abigailt Date: Mon, 29 May 2023 19:15:16 +0300 Subject: [PATCH 09/11] All tests pass, still need to review TODOs Signed-off-by: abigailt --- apt/minimization/minimizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py index f5b1219..5508c34 100644 --- a/apt/minimization/minimizer.py +++ b/apt/minimization/minimizer.py @@ -972,7 +972,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM if 'untouched' in cell: for feature in cell['untouched']: record_copy.pop(feature) - representative.pop(feature) + if feature in representative: + representative.pop(feature) if record_copy == representative: # handle numerical features for feature in [key for key in cell['ranges'].keys() if From aa38a1d7162833b016b33b8b2b3c857b9a57d77d Mon Sep 17 00:00:00 2001 From: abigailt Date: Mon, 29 May 2023 21:27:01 +0300 Subject: [PATCH 10/11] Fix computing generalizations from transformed data + add some tests Signed-off-by: abigailt --- apt/minimization/minimizer.py | 203 +++++++++++++--------------------- tests/test_minimizer.py | 136 +++++++++++++++++++++++ 2 files changed, 213 insertions(+), 126 deletions(-) diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py index 5508c34..928e7a9 100644 --- a/apt/minimization/minimizer.py +++ b/apt/minimization/minimizer.py @@ -178,7 +178,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM def fit_transform(self, X: 
Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None, dataset: Optional[ArrayDataset] = None): """ - Learns the generalizations based on training data, and applies them to the data. + Learns the generalizations based on training data, and applies them to the data. Updates stored ncp value to the + one computed on the training data. :param X: The training input samples. :type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional @@ -383,7 +384,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None, dataset: Optional[ArrayDataset] = None): - """ Transforms data records to representative points. + """ Transforms data records to representative points. Updates stored ncp value to the one computed on the + transformed data. :param X: The training input samples. :type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional @@ -407,7 +409,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM def calculate_ncp(self, samples: Optional[ArrayDataset] = None, transformed: Optional[bool] = False): """ Compute the NCP score of the generalization. Calculation is based on the value of the - generalize_using_transform param. + generalize_using_transform param. If samples are provided, updates stored ncp value to the one computed on the + provided data. If samples not provided, returns the last NCP score computed by the `fit` or `transform` method. 
+ Based on the NCP score presented in: Ghinita, G., Karras, P., Kalnis, P., Mamoulis, N.: Fast data anonymization with low information loss (https://www.vldb.org/conf/2007/papers/research/p758-ghinita.pdf) @@ -423,13 +427,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM if samples is None: return self._ncp + if not samples.features_names: + raise ValueError('features_names should be set in input ArrayDataset.') samples_pd = pd.DataFrame(samples.get_samples(), columns=samples.features_names) if self._features is None: self._features = samples.features_names if self._feature_data is None: self._feature_data = self._get_feature_data(samples_pd) + total_samples = samples_pd.shape[0] if self.generalize_using_transform: + # TODO: not sure I need to transform, data should be mapped to correct cell both with or without transforming if not transformed: # transform data transformed_data = self._inner_transform(dataset=samples) # can return numpy or pandas @@ -437,35 +445,28 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM transformed_data = pd.DataFrame(transformed_data, columns=samples.features_names) else: transformed_data = samples_pd - range_counts, category_counts = self._calculate_transformed_generalizations(transformed_data) - generalizations = self._transformed_generalizations + generalizations = self._calculate_transformed_generalizations(transformed_data) + # count how many transformed values are mapped to each cell + counted = np.zeros(transformed_data.shape[0]) # to mark records we already counted + ncp = 0 + for i in range(len(self.cells)): + cell = self.cells[i] + count = self._get_record_count_for_cell(transformed_data, cell, counted) + range_counts = {} + category_counts = {} + for feature in cell['ranges']: + range_counts[feature] = [count] + for feature in cell['categories']: + category_counts[feature] = [count] + ncp += self._calc_ncp_for_generalization(generalizations[cell['id']], 
range_counts, category_counts, + total_samples) + self._ncp = ncp else: # use generalizations generalizations = self.generalizations range_counts = self._find_range_counts(samples_pd, generalizations['ranges']) category_counts = self._find_categories_counts(samples_pd, generalizations['categories']) + self._ncp = self._calc_ncp_for_generalization(generalizations, range_counts, category_counts, total_samples) - # suppressed features are already taken care of within _calc_ncp_numeric - #TODO: check that this is the case for tramsformed as well - ranges = generalizations['ranges'] - categories = generalizations['categories'] - - total = samples_pd.shape[0] - total_ncp = 0 - total_features = len(generalizations['untouched']) - for feature in ranges.keys(): - feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature], - self._feature_data[feature], total) - total_ncp = total_ncp + feature_ncp - total_features += 1 - for feature in categories.keys(): - feature_ncp = self._calc_ncp_categorical(categories[feature], category_counts[feature], - self._feature_data[feature], - total) - total_ncp = total_ncp + feature_ncp - total_features += 1 - if total_features == 0: - return 0 - self._ncp = total_ncp / total_features return self._ncp def _inner_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None, @@ -518,6 +519,28 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM return generalized return generalized.to_numpy() + def _calc_ncp_for_generalization(self, generalization, range_counts, category_counts, total_count): + total_ncp = 0 + total_features = len(generalization['untouched']) + ranges = generalization['ranges'] + categories = generalization['categories'] + + # suppressed features are already taken care of within _calc_ncp_numeric + for feature in ranges.keys(): + feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature], + self._feature_data[feature], 
total_count) + total_ncp = total_ncp + feature_ncp + total_features += 1 + for feature in categories.keys(): + feature_ncp = self._calc_ncp_categorical(categories[feature], category_counts[feature], + self._feature_data[feature], + total_count) + total_ncp = total_ncp + feature_ncp + total_features += 1 + if total_features == 0: + return 0 + return total_ncp / total_features + @staticmethod def _calc_ncp_categorical(categories, category_count, feature_data, total): category_sizes = [len(g) if len(g) > 1 else 0 for g in categories] @@ -538,7 +561,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes) return average_range_size / (feature_data['max'] - feature_data['min']) - def _get_feature_data(self, x): feature_data = {} for feature in self._features: @@ -561,6 +583,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM indexes.append(index) return indexes + def _get_record_count_for_cell(self, X, cell, mapped): + count = 0 + for index, row in X.iterrows(): + if not mapped.item(index) and self._cell_contains(cell, row, index, mapped): + count += 1 + return count + def _cell_contains(self, cell, x, index, mapped): for f in self._features: i = self._features.index(f) @@ -880,7 +909,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM current_accuracy) if feature is None: return None - GeneralizeToRepresentative._remove_feature_from_cells(self.cells, self._cells_by_id, feature) + self._remove_feature_from_cells(self.cells, self._cells_by_id, feature) return feature def _get_feature_to_remove(self, original_data, prepared_data, nodes, labels, feature_data, current_accuracy): @@ -946,97 +975,26 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM return remove_feature def _calculate_generalizations(self): - self._generalizations = {'ranges': 
GeneralizeToRepresentative._calculate_ranges(self.cells), - 'categories': GeneralizeToRepresentative._calculate_categories(self.cells), - 'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells)} + self._generalizations = {'ranges': self._calculate_ranges(self.cells), + 'categories': self._calculate_categories(self.cells), + 'untouched': self._calculate_untouched(self.cells)} self._remove_categorical_untouched(self._generalizations) + def _calculate_generalizations_per_cell(self, cell): + generalizations = {'ranges': self._calculate_ranges([cell]), + 'categories': self._calculate_categories([cell]), + 'untouched': self._calculate_untouched([cell])} + self._remove_categorical_untouched(generalizations) + return generalizations + def _calculate_transformed_generalizations(self, transformed): - # transformed data should only consist of representative values from cells (when removing untouched features) - ranges = {} - categories = {} - range_counts = {} - category_counts = {} - - unique_records = transformed.value_counts().reset_index(name='count') - representatives = unique_records.drop('count', axis=1) - representative_counts = unique_records['count'] # needed to normalize ncp according to quantity - index = 0 - for _, record in representatives.iterrows(): - # TODO: what if some cells are not present, we will not take their generalizations into account. We need to - # "gain" ncp in this case... 
- record_dict = self.pandas_record_to_dict(record) - for cell in self.cells: - representative = cell["representative"].copy() - record_copy = record_dict.copy() - if 'untouched' in cell: - for feature in cell['untouched']: - record_copy.pop(feature) - if feature in representative: - representative.pop(feature) - if record_copy == representative: - # handle numerical features - for feature in [key for key in cell['ranges'].keys() if - 'untouched' not in cell or key not in cell['untouched']]: - if feature not in ranges.keys(): - ranges[feature] = [] - if cell['ranges'][feature]['start'] is not None: - ranges[feature].append(cell['ranges'][feature]['start']) - if cell['ranges'][feature]['end'] is not None: - ranges[feature].append(cell['ranges'][feature]['end']) - if feature in range_counts: - range_counts[feature].append(representative_counts[index]) - else: - range_counts[feature] = [representative_counts[index]] - # handle categorical features - categorical_features_values = {} - for feature in [key for key in cell['categories'].keys() if - 'untouched' not in cell or key not in cell['untouched']]: - if feature not in categorical_features_values.keys(): - categorical_features_values[feature] = [] - for value in cell['categories'][feature]: - if value not in categorical_features_values[feature]: - categorical_features_values[feature].append(value) - for feature in categorical_features_values.keys(): - partitions = [] - values = categorical_features_values[feature] - assigned = [] - for i in range(len(values)): - value1 = values[i] - if value1 in assigned: - continue - partition = [value1] - assigned.append(value1) - for j in range(len(values)): - if j <= i: - continue - value2 = values[j] - if GeneralizeToRepresentative._are_inseparable(self.cells, feature, value1, value2): - partition.append(value2) - assigned.append(value2) - partitions.append(partition) - if feature in categories: - categories[feature].append(partitions) - else: - categories[feature] = 
[partitions] - if feature in category_counts: - category_counts[feature].append(representative_counts[index]) - else: - category_counts[feature] = [representative_counts[index]] - break - index += 1 - - for feature in ranges.keys(): - ranges[feature] = list(set(ranges[feature])) - ranges[feature].sort() - - self._transformed_generalizations = { - 'ranges': ranges, - 'categories': categories, - 'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells)} - self._remove_categorical_untouched(self._transformed_generalizations) - return range_counts, category_counts + # calculate generalizations separately per cell + cell_generalizations = {} + for cell in self.cells: + cell_generalizations[cell['id']] = self._calculate_generalizations_per_cell(cell) + return cell_generalizations + @staticmethod def _find_range_counts(self, samples, ranges): range_counts = {} last_value = None @@ -1050,10 +1008,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM counter = [item for item in samples[r] if int(item) <= value] range_counts[r].append(len(counter)) last_value = value - counter = [item for item in samples[r] if int(item) <= last_value] + counter = [item for item in samples[r] if int(item) > last_value] range_counts[r].append(len(counter)) return range_counts + @staticmethod def _find_categories_counts(self, samples, categories): category_counts = {} for c in categories.keys(): @@ -1159,12 +1118,4 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM to_remove.append(feature) for feature in to_remove: - del generalizations['categories'][feature] - - - @staticmethod - def pandas_record_to_dict(record): - dict = {} - for feature in record.index: - dict[feature] = record[feature] - return dict \ No newline at end of file + del generalizations['categories'][feature] \ No newline at end of file diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py index 3b2f543..c6fe1d4 100644 --- 
a/tests/test_minimizer.py +++ b/tests/test_minimizer.py @@ -164,6 +164,138 @@ def test_minimizer_fit(data): assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) +def test_minimizer_ncp(data): + features = ['age', 'height'] + X = np.array([[23, 165], + [45, 158], + [56, 123], + [67, 154], + [45, 149], + [42, 166], + [73, 172], + [94, 168], + [69, 175], + [24, 181], + [18, 190]]) + y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]) + X1 = np.array([[33, 165], + [43, 150], + [71, 143], + [92, 194], + [13, 125], + [22, 169]]) + + base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, + min_samples_leaf=1) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES) + model.fit(ArrayDataset(X, y)) + ad = ArrayDataset(X) + ad1 = ArrayDataset(X1, features_names=features) + predictions = model.predict(ad) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, axis=1) + target_accuracy = 0.4 + train_dataset = ArrayDataset(X, predictions, features_names=features) + + gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, generalize_using_transform=False) + gen1.fit(dataset=train_dataset) + ncp1 = gen1.ncp + gen1.calculate_ncp(ad1) + ncp2 = gen1.ncp + + gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy) + gen2.fit(dataset=train_dataset) + ncp3 = gen2.ncp + gen2.transform(dataset=ad1) + ncp4 = gen2.ncp + gen2.transform(dataset=ad) + ncp5 = gen2.ncp + gen2.transform(dataset=ad1) + ncp6 = gen2.ncp + + assert(ncp1 <= ncp3) + assert(ncp2 != ncp3) + assert(ncp3 != ncp4) + assert(ncp4 != ncp5) + assert(ncp6 == ncp4) + + +def test_minimizer_ncp_categorical(data): + features = ['age', 'height', 'sex', 'ola'] + X = [[23, 165, 'f', 'aa'], + [45, 158, 'f', 'aa'], + [56, 123, 'f', 'bb'], + [67, 154, 'm', 'aa'], + [45, 149, 'f', 'bb'], + [42, 166, 'm', 'bb'], + [73, 172, 'm', 'bb'], + [94, 168, 'f', 'aa'], + [69, 175, 'm', 'aa'], + [24, 181, 'm', 'bb'], + [18, 190, 
'm', 'bb']] + X = pd.DataFrame(X, columns=features) + y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]) + X1 = [[33, 165, 'f', 'aa'], + [43, 150, 'm', 'aa'], + [71, 143, 'f', 'aa'], + [92, 194, 'm', 'aa'], + [13, 125, 'f', 'aa'], + [22, 169, 'f', 'bb']] + X1 = pd.DataFrame(X1, columns=features) + + numeric_features = ["age", "height"] + numeric_transformer = Pipeline( + steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))] + ) + + categorical_features = ["sex", "ola"] + categorical_transformer = OneHotEncoder(handle_unknown="ignore") + + preprocessor = ColumnTransformer( + transformers=[ + ("num", numeric_transformer, numeric_features), + ("cat", categorical_transformer, categorical_features), + ] + ) + encoded = preprocessor.fit_transform(X) + encoded = pd.DataFrame(encoded) + + base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, + min_samples_leaf=1) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES) + model.fit(ArrayDataset(encoded, y)) + ad = ArrayDataset(X) + ad1 = ArrayDataset(X1) + predictions = model.predict(ArrayDataset(encoded)) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, axis=1) + target_accuracy = 0.4 + train_dataset = ArrayDataset(X, predictions, features_names=features) + + gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, generalize_using_transform=False, + categorical_features=categorical_features) + gen1.fit(dataset=train_dataset) + ncp1 = gen1.ncp + gen1.calculate_ncp(ad1) + ncp2 = gen1.ncp + + gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, categorical_features=categorical_features) + gen2.fit(dataset=train_dataset) + ncp3 = gen2.ncp + gen2.transform(dataset=ad1) + ncp4 = gen2.ncp + gen2.transform(dataset=ad) + ncp5 = gen2.ncp + gen2.transform(dataset=ad1) + ncp6 = gen2.ncp + + assert(ncp1 <= ncp3) + assert(ncp2 != ncp3) + assert(ncp3 != ncp4) + assert(ncp4 != ncp5) + assert(ncp6 == ncp4) + + def 
test_minimizer_fit_not_transform(data): features = ['age', 'height'] X = np.array([[23, 165], @@ -1099,5 +1231,9 @@ def test_errors(): gen = GeneralizeToRepresentative(model, generalize_using_transform=False) train_dataset = ArrayDataset(X, predictions, features_names=features) gen.fit(dataset=train_dataset) + with pytest.raises(ValueError): gen.transform(X) + + with pytest.raises(ValueError): + gen.calculate_ncp(ad) From f1995ea6f90e923a90bb104c65ff08c9513de249 Mon Sep 17 00:00:00 2001 From: abigailt Date: Mon, 29 May 2023 21:34:25 +0300 Subject: [PATCH 11/11] Removed redundant transforming of data Signed-off-by: abigailt --- apt/minimization/minimizer.py | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py index 928e7a9..09b7099 100644 --- a/apt/minimization/minimizer.py +++ b/apt/minimization/minimizer.py @@ -403,10 +403,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM raise ValueError('transform method called even though generalize_using_transform parameter was False. This ' 'can lead to inconsistent results.') transformed_dataset = ArrayDataset(transformed, features_names=self._features) - self.calculate_ncp(transformed_dataset, True) + self.calculate_ncp(transformed_dataset) return transformed - def calculate_ncp(self, samples: Optional[ArrayDataset] = None, transformed: Optional[bool] = False): + def calculate_ncp(self, samples: Optional[ArrayDataset] = None): """ Compute the NCP score of the generalization. Calculation is based on the value of the generalize_using_transform param. If samples are provided, updates stored ncp value to the one computed on the @@ -419,9 +419,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM transformed (e.g., test/runtime data). If not samples supplied, will return the last NCP score computed by the `fit` or `transform` method. 
:type samples: ArrayDataset, optional. feature_names should be set. - :param transformed: Whether the supplied samples have already been transformed using the `transform` method. - Default is False. - :type transformed: boolean, optional :return: NCP score as float. """ if samples is None: @@ -437,21 +434,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM total_samples = samples_pd.shape[0] if self.generalize_using_transform: - # TODO: not sure I need to transform, data should be mapped to correct cell both with or without transforming - if not transformed: - # transform data - transformed_data = self._inner_transform(dataset=samples) # can return numpy or pandas - if not samples.is_pandas: - transformed_data = pd.DataFrame(transformed_data, columns=samples.features_names) - else: - transformed_data = samples_pd - generalizations = self._calculate_transformed_generalizations(transformed_data) - # count how many transformed values are mapped to each cell - counted = np.zeros(transformed_data.shape[0]) # to mark records we already counted + generalizations = self._calculate_cell_generalizations() + # count how many records are mapped to each cell + counted = np.zeros(samples_pd.shape[0]) # to mark records we already counted ncp = 0 for i in range(len(self.cells)): cell = self.cells[i] - count = self._get_record_count_for_cell(transformed_data, cell, counted) + count = self._get_record_count_for_cell(samples_pd, cell, counted) range_counts = {} category_counts = {} for feature in cell['ranges']: @@ -987,7 +976,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self._remove_categorical_untouched(generalizations) return generalizations - def _calculate_transformed_generalizations(self, transformed): + def _calculate_cell_generalizations(self): # calculate generalizations separately per cell cell_generalizations = {} for cell in self.cells: @@ -995,7 +984,7 @@ class 
GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM return cell_generalizations @staticmethod - def _find_range_counts(self, samples, ranges): + def _find_range_counts(samples, ranges): range_counts = {} last_value = None for r in ranges.keys(): @@ -1013,7 +1002,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM return range_counts @staticmethod - def _find_categories_counts(self, samples, categories): + def _find_categories_counts(samples, categories): category_counts = {} for c in categories.keys(): category_counts[c] = []