diff --git a/apt/risk/data_assessment/attack_strategy_utils.py b/apt/risk/data_assessment/attack_strategy_utils.py index 477a727..17c44f9 100644 --- a/apt/risk/data_assessment/attack_strategy_utils.py +++ b/apt/risk/data_assessment/attack_strategy_utils.py @@ -131,14 +131,19 @@ class KNNAttackStrategyUtils(AttackStrategyUtils): def is_categorical(col_name): col_name in categorical_features or is_categorical_dtype(df1_samples.dtypes[col_name]) - def is_numeric(col_name): is_numeric_dtype(df1_samples.dtypes[col_name]) + def is_numeric(col_name): + is_numeric_dtype(df1_samples.dtypes[col_name]) + for name, _ in df1_samples.items(): KNNAttackStrategyUtils._column_statistical_test(df1_samples[name], df2_samples[name], name, is_categorical, is_numeric(df1_samples.dtypes[name]), test_type, alpha, differing_columns) else: is_df1_numeric_dtype = np.issubdtype(df1_samples.dtype, int) or np.issubdtype(df1_samples.dtype, float) - def is_categorical(col_name): col_name in categorical_features + + def is_categorical(col_name): + col_name in categorical_features + for i, column in enumerate(df1_samples.T): KNNAttackStrategyUtils._column_statistical_test(df1_samples[:, i], df2_samples[:, i], i, is_categorical, is_df1_numeric_dtype, test_type, alpha, diff --git a/apt/risk/data_assessment/dataset_attack_membership_classification.py b/apt/risk/data_assessment/dataset_attack_membership_classification.py index ca86af2..670b4e8 100644 --- a/apt/risk/data_assessment/dataset_attack_membership_classification.py +++ b/apt/risk/data_assessment/dataset_attack_membership_classification.py @@ -134,7 +134,7 @@ class DatasetAttackMembershipClassification(DatasetAttackMembership): predict_proba = classifier.predict_proba(test_x) return roc_auc_score(test_labels, predict_proba[:, 1]) - def calculate_privacy_score(self, member_roc_auc: float, non_member_roc_auc: float) ->( + def calculate_privacy_score(self, member_roc_auc: float, non_member_roc_auc: float) -> ( DatasetAttackScoreMembershipClassification): """ Compare the distinguishability of the synthetic data from the members dataset (training) @@ -144,7 +144,7 @@ class DatasetAttackMembershipClassification(DatasetAttackMembership): score, baseline_score = member_roc_auc, non_member_roc_auc if 0 < baseline_score <= score: - normalized_ratio = score/baseline_score - 1.0 + normalized_ratio = score / baseline_score - 1.0 else: normalized_ratio = 0 diff --git a/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py b/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py index e902521..0f9bd88 100644 --- a/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py +++ b/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py @@ -6,8 +6,6 @@ https://doi.org/10.1145/3372297.3417238 and its implementation in https://github """ from dataclasses import dataclass from typing import Callable -import os.path -from math import floor import numpy as np from sklearn.neighbors import NearestNeighbors @@ -119,14 +117,15 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership): # X_non_members = np.genfromtxt(test_filename, delimiter=",") # else: x_synth_ref = self.generate_synth_data(len(X_reference), n_components=10, original_data=X_reference) - # np.savetxt(ref_filename, x_synth_ref, delimiter=",") - # np.savetxt(test_filename, X_non_members, delimiter=",") + # np.savetxt(ref_filename, x_synth_ref, delimiter=",") + # np.savetxt(test_filename, X_non_members, delimiter=",") self.original_data_non_members = ArrayDataset(X_non_members) self.synthetic_data_ref = ArrayDataset(x_synth_ref) if config.compute_distance: - self.knn_learner_ref = NearestNeighbors(n_neighbors=config.k, algorithm='auto', metric=config.compute_distance, - metric_params=config.distance_params) + self.knn_learner_ref = NearestNeighbors(n_neighbors=config.k, algorithm='auto', + metric=config.compute_distance, + metric_params=config.distance_params) else: self.knn_learner_ref = NearestNeighbors(n_neighbors=config.k, algorithm='auto') @@ -166,26 +165,30 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership): self.attack_strategy_utils.fit(self.knn_learner_ref, self.synthetic_data_ref) # members query - member_distances_ref = self.attack_strategy_utils.find_knn(self.knn_learner_ref, self.original_data_members) + member_distances_ref = self.attack_strategy_utils.find_knn(self.knn_learner_ref, + self.original_data_members) # non-members query - non_member_distances_ref = self.attack_strategy_utils.find_knn(self.knn_learner_ref, self.original_data_non_members) + non_member_distances_ref = self.attack_strategy_utils.find_knn(self.knn_learner_ref, + self.original_data_non_members) assert (len(member_distances) == len(member_distances_ref)) assert (len(non_member_distances) == len(non_member_distances_ref)) num_pos_samples = len(member_distances) num_neg_samples = len(non_member_distances) - member_proba_calibrate = self.probability_per_sample(member_distances[:num_pos_samples] - member_distances_ref[:num_pos_samples]) - non_member_proba_calibrate = self.probability_per_sample(non_member_distances[:num_neg_samples] - non_member_distances_ref[:num_neg_samples]) + member_proba_calibrate = self.probability_per_sample(member_distances[:num_pos_samples] - + member_distances_ref[:num_pos_samples]) + non_member_proba_calibrate = self.probability_per_sample(non_member_distances[:num_neg_samples] - + non_member_distances_ref[:num_neg_samples]) result = DatasetAttackResultMembership(member_probabilities=member_proba_calibrate, - non_member_probabilities=non_member_proba_calibrate) + non_member_probabilities=non_member_proba_calibrate) else: member_proba = self.probability_per_sample(member_distances) non_member_proba = self.probability_per_sample(non_member_distances) result = DatasetAttackResultMembership(member_probabilities=member_proba, - non_member_probabilities=non_member_proba) + non_member_probabilities=non_member_proba) score = self.calculate_privacy_score(result, self.config.generate_plot) score.distributions_validation_result = distributions_validation_result diff --git a/tests/test_data_assessment.py b/tests/test_data_assessment.py index 2128a78..3a69758 100644 --- a/tests/test_data_assessment.py +++ b/tests/test_data_assessment.py @@ -13,8 +13,7 @@ from apt.risk.data_assessment.dataset_assessment_manager import DatasetAssessmen from apt.utils.dataset_utils import get_iris_dataset_np, get_diabetes_dataset_np, get_adult_dataset_pd, \ get_nursery_dataset_pd from apt.utils.datasets import ArrayDataset -from data_assessment.dataset_attack_membership_classification import DatasetAttackConfigMembershipClassification, \ - DatasetAttackMembershipClassification, DatasetAttackScoreMembershipClassification +from data_assessment.dataset_attack_membership_classification import DatasetAttackScoreMembershipClassification from data_assessment.dataset_attack_membership_knn_probabilities import DatasetAttackScoreMembershipKnnProbabilities from data_assessment.dataset_attack_whole_dataset_knn_distance import DatasetAttackScoreWholeDatasetKnnDistance @@ -22,7 +21,6 @@ MIN_SHARE = 0.5 MIN_ROC_AUC = 0.0 MIN_PRECISION = 0.0 -NUM_SYNTH_SAMPLES = 100 NUM_SYNTH_COMPONENTS = 4 iris_dataset_np = get_iris_dataset_np() @@ -99,13 +97,13 @@ def test_risk_kde(name, data, dataset_type, mgr): else: raise ValueError('Pandas dataset missing a preprocessing step') + num_synth_samples = x_train.shape[0] # required by the chi test synth_data = ArrayDataset( - kde(x_train.shape[0], n_components=num_synth_components, original_data=encoded)) - # kde(NUM_SYNTH_SAMPLES, n_components=num_synth_components, original_data=encoded)) + kde(num_synth_samples, n_components=num_synth_components, original_data=encoded)) original_data_members = ArrayDataset(encoded, y_train) original_data_non_members = ArrayDataset(encoded_test, y_test) - dataset_name = 'kde' + str(NUM_SYNTH_SAMPLES) + name + dataset_name = 'kde' + str(num_synth_samples) + name assess_privacy_and_validate_result(mgr, original_data_members, original_data_non_members, synth_data, dataset_name, categorical_features) @@ -185,7 +183,6 @@ def filter_categorical(feature_names, return_feature_names: bool = True): return list(np.flatnonzero(np.char.startswith(feature_name_strs, 'cat__'))) - def assess_privacy_and_validate_result(dataset_assessment_manager, original_data_members, original_data_non_members, synth_data, dataset_name, categorical_features): attack_scores = dataset_assessment_manager.assess(original_data_members, original_data_non_members, synth_data, @@ -194,12 +191,12 @@ def assess_privacy_and_validate_result(dataset_assessment_manager, original_data for i, (assessment_type, scores) in enumerate(attack_scores.items()): if assessment_type == 'MembershipKnnProbabilities': score_g: DatasetAttackScoreMembershipKnnProbabilities = scores[0] - assert(score_g.roc_auc_score > MIN_ROC_AUC) - assert(score_g.average_precision_score > MIN_PRECISION) + assert score_g.roc_auc_score > MIN_ROC_AUC + assert score_g.average_precision_score > MIN_PRECISION elif assessment_type == 'WholeDatasetKnnDistance': score_h: DatasetAttackScoreWholeDatasetKnnDistance = scores[0] - assert(score_h.share > MIN_SHARE) + assert score_h.share > MIN_SHARE if assessment_type == 'MembershipClassification': score_mc: DatasetAttackScoreMembershipClassification = scores[0] - assert(score_mc.synthetic_data_quality_warning is False) - assert (0 <= score_mc.normalized_ratio <= 1) + assert score_mc.synthetic_data_quality_warning is False + assert 0 <= score_mc.normalized_ratio <= 1