mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-06-08 15:05:13 +02:00
Flake code cleanups
Signed-off-by: Maya Anderson <mayaa@il.ibm.com>
This commit is contained in:
parent
ad65f6f993
commit
0ee0bf05d6
4 changed files with 33 additions and 28 deletions
|
|
@ -131,14 +131,19 @@ class KNNAttackStrategyUtils(AttackStrategyUtils):
|
|||
def is_categorical(col_name):
    """
    Tell whether a column of ``df1_samples`` should be treated as categorical.

    :param col_name: name of the column to check
    :return: True if ``col_name`` is listed in ``categorical_features`` or the column's dtype in
             ``df1_samples`` is a pandas categorical dtype, otherwise False
    """
    # The result must be returned explicitly: as a bare expression statement the value was
    # discarded and the helper always yielded None (falsy) to whoever called it.
    return col_name in categorical_features or is_categorical_dtype(df1_samples.dtypes[col_name])
|
||||
|
||||
def is_numeric(col_name): is_numeric_dtype(df1_samples.dtypes[col_name])
|
||||
def is_numeric(col_name):
    """
    Tell whether a column of ``df1_samples`` has a numeric dtype.

    :param col_name: name of the column to check
    :return: the result of ``is_numeric_dtype`` applied to the column's dtype in ``df1_samples``
    """
    # The result must be returned explicitly: as a bare expression statement the value was
    # discarded and the helper always yielded None (falsy) to whoever called it.
    return is_numeric_dtype(df1_samples.dtypes[col_name])
|
||||
|
||||
for name, _ in df1_samples.items():
|
||||
KNNAttackStrategyUtils._column_statistical_test(df1_samples[name], df2_samples[name], name,
|
||||
is_categorical, is_numeric(df1_samples.dtypes[name]),
|
||||
test_type, alpha, differing_columns)
|
||||
else:
|
||||
is_df1_numeric_dtype = np.issubdtype(df1_samples.dtype, int) or np.issubdtype(df1_samples.dtype, float)
|
||||
def is_categorical(col_name): col_name in categorical_features
|
||||
|
||||
def is_categorical(col_name):
    """
    Tell whether a column should be treated as categorical.

    :param col_name: identifier of the column to check (looked up in ``categorical_features``)
    :return: True if ``col_name`` is contained in ``categorical_features``, otherwise False
    """
    # The result must be returned explicitly: as a bare expression statement the membership
    # test was discarded and the helper always yielded None (falsy) to whoever called it.
    return col_name in categorical_features
|
||||
|
||||
for i, column in enumerate(df1_samples.T):
|
||||
KNNAttackStrategyUtils._column_statistical_test(df1_samples[:, i], df2_samples[:, i], i,
|
||||
is_categorical, is_df1_numeric_dtype, test_type, alpha,
|
||||
|
|
|
|||
|
|
@ -134,7 +134,7 @@ class DatasetAttackMembershipClassification(DatasetAttackMembership):
|
|||
predict_proba = classifier.predict_proba(test_x)
|
||||
return roc_auc_score(test_labels, predict_proba[:, 1])
|
||||
|
||||
def calculate_privacy_score(self, member_roc_auc: float, non_member_roc_auc: float) ->(
|
||||
def calculate_privacy_score(self, member_roc_auc: float, non_member_roc_auc: float) -> (
|
||||
DatasetAttackScoreMembershipClassification):
|
||||
"""
|
||||
Compare the distinguishability of the synthetic data from the members dataset (training)
|
||||
|
|
@ -144,7 +144,7 @@ class DatasetAttackMembershipClassification(DatasetAttackMembership):
|
|||
score, baseline_score = member_roc_auc, non_member_roc_auc
|
||||
|
||||
if 0 < baseline_score <= score:
|
||||
normalized_ratio = score/baseline_score - 1.0
|
||||
normalized_ratio = score / baseline_score - 1.0
|
||||
else:
|
||||
normalized_ratio = 0
|
||||
|
||||
|
|
|
|||
|
|
@ -6,8 +6,6 @@ https://doi.org/10.1145/3372297.3417238 and its implementation in https://github
|
|||
"""
|
||||
from dataclasses import dataclass
|
||||
from typing import Callable
|
||||
import os.path
|
||||
from math import floor
|
||||
|
||||
import numpy as np
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
|
|
@ -119,14 +117,15 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership):
|
|||
# X_non_members = np.genfromtxt(test_filename, delimiter=",")
|
||||
# else:
|
||||
x_synth_ref = self.generate_synth_data(len(X_reference), n_components=10, original_data=X_reference)
|
||||
# np.savetxt(ref_filename, x_synth_ref, delimiter=",")
|
||||
# np.savetxt(test_filename, X_non_members, delimiter=",")
|
||||
# np.savetxt(ref_filename, x_synth_ref, delimiter=",")
|
||||
# np.savetxt(test_filename, X_non_members, delimiter=",")
|
||||
|
||||
self.original_data_non_members = ArrayDataset(X_non_members)
|
||||
self.synthetic_data_ref = ArrayDataset(x_synth_ref)
|
||||
if config.compute_distance:
|
||||
self.knn_learner_ref = NearestNeighbors(n_neighbors=config.k, algorithm='auto', metric=config.compute_distance,
|
||||
metric_params=config.distance_params)
|
||||
self.knn_learner_ref = NearestNeighbors(n_neighbors=config.k, algorithm='auto',
|
||||
metric=config.compute_distance,
|
||||
metric_params=config.distance_params)
|
||||
else:
|
||||
self.knn_learner_ref = NearestNeighbors(n_neighbors=config.k, algorithm='auto')
|
||||
|
||||
|
|
@ -166,26 +165,30 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership):
|
|||
self.attack_strategy_utils.fit(self.knn_learner_ref, self.synthetic_data_ref)
|
||||
|
||||
# members query
|
||||
member_distances_ref = self.attack_strategy_utils.find_knn(self.knn_learner_ref, self.original_data_members)
|
||||
member_distances_ref = self.attack_strategy_utils.find_knn(self.knn_learner_ref,
|
||||
self.original_data_members)
|
||||
|
||||
# non-members query
|
||||
non_member_distances_ref = self.attack_strategy_utils.find_knn(self.knn_learner_ref, self.original_data_non_members)
|
||||
non_member_distances_ref = self.attack_strategy_utils.find_knn(self.knn_learner_ref,
|
||||
self.original_data_non_members)
|
||||
|
||||
assert (len(member_distances) == len(member_distances_ref))
|
||||
assert (len(non_member_distances) == len(non_member_distances_ref))
|
||||
num_pos_samples = len(member_distances)
|
||||
num_neg_samples = len(non_member_distances)
|
||||
|
||||
member_proba_calibrate = self.probability_per_sample(member_distances[:num_pos_samples] - member_distances_ref[:num_pos_samples])
|
||||
non_member_proba_calibrate = self.probability_per_sample(non_member_distances[:num_neg_samples] - non_member_distances_ref[:num_neg_samples])
|
||||
member_proba_calibrate = self.probability_per_sample(member_distances[:num_pos_samples] -
|
||||
member_distances_ref[:num_pos_samples])
|
||||
non_member_proba_calibrate = self.probability_per_sample(non_member_distances[:num_neg_samples] -
|
||||
non_member_distances_ref[:num_neg_samples])
|
||||
|
||||
result = DatasetAttackResultMembership(member_probabilities=member_proba_calibrate,
|
||||
non_member_probabilities=non_member_proba_calibrate)
|
||||
non_member_probabilities=non_member_proba_calibrate)
|
||||
else:
|
||||
member_proba = self.probability_per_sample(member_distances)
|
||||
non_member_proba = self.probability_per_sample(non_member_distances)
|
||||
result = DatasetAttackResultMembership(member_probabilities=member_proba,
|
||||
non_member_probabilities=non_member_proba)
|
||||
non_member_probabilities=non_member_proba)
|
||||
|
||||
score = self.calculate_privacy_score(result, self.config.generate_plot)
|
||||
score.distributions_validation_result = distributions_validation_result
|
||||
|
|
|
|||
|
|
@ -13,8 +13,7 @@ from apt.risk.data_assessment.dataset_assessment_manager import DatasetAssessmen
|
|||
from apt.utils.dataset_utils import get_iris_dataset_np, get_diabetes_dataset_np, get_adult_dataset_pd, \
|
||||
get_nursery_dataset_pd
|
||||
from apt.utils.datasets import ArrayDataset
|
||||
from data_assessment.dataset_attack_membership_classification import DatasetAttackConfigMembershipClassification, \
|
||||
DatasetAttackMembershipClassification, DatasetAttackScoreMembershipClassification
|
||||
from data_assessment.dataset_attack_membership_classification import DatasetAttackScoreMembershipClassification
|
||||
from data_assessment.dataset_attack_membership_knn_probabilities import DatasetAttackScoreMembershipKnnProbabilities
|
||||
from data_assessment.dataset_attack_whole_dataset_knn_distance import DatasetAttackScoreWholeDatasetKnnDistance
|
||||
|
||||
|
|
@ -22,7 +21,6 @@ MIN_SHARE = 0.5
|
|||
MIN_ROC_AUC = 0.0
|
||||
MIN_PRECISION = 0.0
|
||||
|
||||
NUM_SYNTH_SAMPLES = 100
|
||||
NUM_SYNTH_COMPONENTS = 4
|
||||
|
||||
iris_dataset_np = get_iris_dataset_np()
|
||||
|
|
@ -99,13 +97,13 @@ def test_risk_kde(name, data, dataset_type, mgr):
|
|||
else:
|
||||
raise ValueError('Pandas dataset missing a preprocessing step')
|
||||
|
||||
num_synth_samples = x_train.shape[0] # required by the chi test
|
||||
synth_data = ArrayDataset(
|
||||
kde(x_train.shape[0], n_components=num_synth_components, original_data=encoded))
|
||||
# kde(NUM_SYNTH_SAMPLES, n_components=num_synth_components, original_data=encoded))
|
||||
kde(num_synth_samples, n_components=num_synth_components, original_data=encoded))
|
||||
original_data_members = ArrayDataset(encoded, y_train)
|
||||
original_data_non_members = ArrayDataset(encoded_test, y_test)
|
||||
|
||||
dataset_name = 'kde' + str(NUM_SYNTH_SAMPLES) + name
|
||||
dataset_name = 'kde' + str(num_synth_samples) + name
|
||||
assess_privacy_and_validate_result(mgr, original_data_members, original_data_non_members, synth_data, dataset_name,
|
||||
categorical_features)
|
||||
|
||||
|
|
@ -185,7 +183,6 @@ def filter_categorical(feature_names, return_feature_names: bool = True):
|
|||
return list(np.flatnonzero(np.char.startswith(feature_name_strs, 'cat__')))
|
||||
|
||||
|
||||
|
||||
def assess_privacy_and_validate_result(dataset_assessment_manager, original_data_members, original_data_non_members,
|
||||
synth_data, dataset_name, categorical_features):
|
||||
attack_scores = dataset_assessment_manager.assess(original_data_members, original_data_non_members, synth_data,
|
||||
|
|
@ -194,12 +191,12 @@ def assess_privacy_and_validate_result(dataset_assessment_manager, original_data
|
|||
for i, (assessment_type, scores) in enumerate(attack_scores.items()):
|
||||
if assessment_type == 'MembershipKnnProbabilities':
|
||||
score_g: DatasetAttackScoreMembershipKnnProbabilities = scores[0]
|
||||
assert(score_g.roc_auc_score > MIN_ROC_AUC)
|
||||
assert(score_g.average_precision_score > MIN_PRECISION)
|
||||
assert score_g.roc_auc_score > MIN_ROC_AUC
|
||||
assert score_g.average_precision_score > MIN_PRECISION
|
||||
elif assessment_type == 'WholeDatasetKnnDistance':
|
||||
score_h: DatasetAttackScoreWholeDatasetKnnDistance = scores[0]
|
||||
assert(score_h.share > MIN_SHARE)
|
||||
assert score_h.share > MIN_SHARE
|
||||
if assessment_type == 'MembershipClassification':
|
||||
score_mc: DatasetAttackScoreMembershipClassification = scores[0]
|
||||
assert(score_mc.synthetic_data_quality_warning is False)
|
||||
assert (0 <= score_mc.normalized_ratio <= 1)
|
||||
assert score_mc.synthetic_data_quality_warning is False
|
||||
assert 0 <= score_mc.normalized_ratio <= 1
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue