Mirror of https://github.com/IBM/ai-privacy-toolkit.git, synced 2026-04-26 05:16:22 +02:00
Add column distribution comparison and a third method for dataset assessment by membership classification (#84)
* Add column distribution comparison, and a third method for dataset assessment by membership classification.
* Address review comments: add further distribution comparison tests, make them externally configurable as well, and make the alpha parameter configurable too.

Signed-off-by: Maya Anderson <mayaa@il.ibm.com>
This commit is contained in:
parent 13a0567183
commit a40484e0c9

8 changed files with 676 additions and 205 deletions
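For orientation, here is a minimal sketch of how the reworked manager might be driven end to end. The class, config, and method names come from the diff below; the module path (the file under diff appears to be dataset_assessment_manager.py, judging by the class names), the assumption that ArrayDataset accepts a bare samples array, and the x_members/x_non_members/x_synth arrays are all hypothetical stand-ins, not part of this commit.

import numpy as np

from apt.risk.data_assessment.dataset_assessment_manager import \
    DatasetAssessmentManager, DatasetAssessmentManagerConfig
from apt.utils.datasets import ArrayDataset

# Hypothetical inputs: member and non-member samples from the original
# data, plus synthetic samples generated from the members.
x_members = np.random.rand(100, 4)
x_non_members = np.random.rand(100, 4)
x_synth = np.random.rand(100, 4)

config = DatasetAssessmentManagerConfig(persist_reports=True,
                                        timestamp_reports=True,
                                        generate_plots=False)
mgr = DatasetAssessmentManager(config)

# assess() now also takes categorical_features (column names or indices)
scores = mgr.assess(ArrayDataset(x_members), ArrayDataset(x_non_members),
                    ArrayDataset(x_synth), dataset_name='demo',
                    categorical_features=[])
mgr.dump_all_scores_to_files()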
@@ -1,5 +1,7 @@
 from __future__ import annotations

+import time
+from collections import defaultdict
 from dataclasses import dataclass
 from typing import Optional

@@ -11,34 +13,39 @@ from apt.risk.data_assessment.dataset_attack_result import DatasetAttackScore, D
 from apt.risk.data_assessment.dataset_attack_whole_dataset_knn_distance import \
     DatasetAttackConfigWholeDatasetKnnDistance, DatasetAttackWholeDatasetKnnDistance
 from apt.utils.datasets import ArrayDataset
+from apt.risk.data_assessment.dataset_attack_membership_classification import \
+    DatasetAttackConfigMembershipClassification, DatasetAttackMembershipClassification


 @dataclass
 class DatasetAssessmentManagerConfig:
     """
     Configuration for DatasetAssessmentManager.

-    :param persist_reports: Whether to save assessment results to filesystem.
-    :param generate_plots: Whether to generate and visualize plots as part of assessment.
+    :param persist_reports: whether to save assessment results to the filesystem
+    :param timestamp_reports: if persist_reports is True, whether to create a separate report for each
+        timestamp or append to the same report
+    :param generate_plots: whether to generate and visualize plots as part of the assessment
     """
     persist_reports: bool = False
+    timestamp_reports: bool = False
     generate_plots: bool = False

 class DatasetAssessmentManager:
     """
     The main class for running dataset assessment attacks.

     :param config: Configuration parameters to guide the dataset assessment process
     """
-    attack_scores_per_record_knn_probabilities: list[DatasetAttackScore] = []
-    attack_scores_whole_dataset_knn_distance: list[DatasetAttackScore] = []
+    attack_scores = defaultdict(list)

     def __init__(self, config: Optional[DatasetAssessmentManagerConfig] = DatasetAssessmentManagerConfig) -> None:
         """
         :param config: Configuration parameters to guide the dataset assessment process
         """
         self.config = config

     def assess(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
-               synthetic_data: ArrayDataset, dataset_name: str = DEFAULT_DATASET_NAME) -> list[DatasetAttackScore]:
+               synthetic_data: ArrayDataset, dataset_name: str = DEFAULT_DATASET_NAME, categorical_features: list = [])\
+            -> list[DatasetAttackScore]:
         """
         Do dataset privacy risk assessment by running dataset attacks, and return their scores.

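A note on the replaced class attributes: the two fixed per-attack score lists give way to a single defaultdict(list) registry keyed by attack short name, so a later fourth attack needs no new attribute. A minimal, standalone illustration of that behavior (the key names and values are just examples):

from collections import defaultdict

attack_scores = defaultdict(list)
# First access creates an empty list automatically; no KeyError, no setdefault.
attack_scores["MembershipClassification"].append(0.9)
attack_scores["SomeFutureAttack"]  # auto-creates an empty entry too
print(dict(attack_scores))  # {'MembershipClassification': [0.9], 'SomeFutureAttack': []}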
@@ -48,41 +55,54 @@ class DatasetAssessmentManager:
             only samples are used in the assessment
         :param synthetic_data: A container for the synthetic samples and labels, only samples are used in the assessment
         :param dataset_name: A name to identify this dataset, optional
+        :param categorical_features: A list of categorical feature names or numbers

         :return:
             a list of dataset attack risk scores
         """
         # Create attacks
         config_gl = DatasetAttackConfigMembershipKnnProbabilities(use_batches=False,
                                                                   generate_plot=self.config.generate_plots)
         attack_gl = DatasetAttackMembershipKnnProbabilities(original_data_members,
                                                             original_data_non_members,
                                                             synthetic_data,
                                                             config_gl,
-                                                            dataset_name)
-
-        score_gl = attack_gl.assess_privacy()
-        self.attack_scores_per_record_knn_probabilities.append(score_gl)
+                                                            dataset_name, categorical_features)

         config_h = DatasetAttackConfigWholeDatasetKnnDistance(use_batches=False)
         attack_h = DatasetAttackWholeDatasetKnnDistance(original_data_members, original_data_non_members,
-                                                        synthetic_data, config_h, dataset_name)
-
-        score_h = attack_h.assess_privacy()
-        self.attack_scores_whole_dataset_knn_distance.append(score_h)
-        return [score_gl, score_h]
+                                                        synthetic_data, config_h, dataset_name, categorical_features)
+
+        config_mc = DatasetAttackConfigMembershipClassification(classifier_type='LogisticRegression',
+                                                                # 'RandomForestClassifier',
+                                                                threshold=0.9)
+        attack_mc = DatasetAttackMembershipClassification(original_data_members, original_data_non_members,
+                                                          synthetic_data, config_mc, dataset_name)
+
+        attack_list = [
+            (attack_gl, attack_gl.short_name()),  # "MembershipKnnProbabilities"
+            (attack_h, attack_h.short_name()),  # "WholeDatasetKnnDistance"
+            (attack_mc, attack_mc.short_name()),  # "MembershipClassification"
+        ]
+
+        for i, (attack, attack_name) in enumerate(attack_list):
+            print(f"Running {attack_name} attack on {dataset_name}")
+            score = attack.assess_privacy()
+            self.attack_scores[attack_name].append(score)
+
+        return self.attack_scores
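Because assess() now returns the per-attack score registry rather than a fixed [score_gl, score_h] pair, callers index results by attack short name. A sketch of consuming it, assuming the short names shown in the comments above, and assuming each DatasetAttackScore exposes a risk_score field (an assumption; the field is not shown in this diff):

# 'scores' is the mapping returned by assess(): attack short name -> list of scores
for attack_name, score_list in scores.items():
    for score in score_list:
        # risk_score as a DatasetAttackScore field is an assumption here
        print(f"{attack_name}: risk score {score.risk_score}")

# Or read one method directly, e.g. the new membership-classification attack:
mc_scores = scores["MembershipClassification"]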

     def dump_all_scores_to_files(self):
         """
         Save assessment results to filesystem.
         """
         if self.config.persist_reports:
-            results_log_file = "_results.log.csv"
-            self._dump_scores_to_file(self.attack_scores_per_record_knn_probabilities,
-                                      "per_record_knn_probabilities" + results_log_file, True)
-            self._dump_scores_to_file(self.attack_scores_whole_dataset_knn_distance,
-                                      "whole_dataset_knn_distance" + results_log_file, True)
-
-    @staticmethod
-    def _dump_scores_to_file(attack_scores: list[DatasetAttackScore], filename: str, header: bool):
-        run_results_df = pd.DataFrame(attack_scores).drop('result', axis=1, errors='ignore')  # don't serialize result
-        run_results_df.to_csv(filename, header=header, encoding='utf-8', index=False, mode='w')  # Overwrite
+            time_str = time.strftime("%Y%m%d-%H%M%S")
+            for i, (attack_name, attack_scores) in enumerate(self.attack_scores.items()):
+                if self.config.timestamp_reports:
+                    results_log_file = f"{time_str}_{attack_name}_results.log.csv"
+                else:
+                    results_log_file = f"{attack_name}_results.log.csv"
+                run_results_df = (pd.DataFrame(attack_scores).drop('result', axis=1, errors='ignore').
+                                  drop('distributions_validation_result', axis=1, errors='ignore'))
+                run_results_df.to_csv(results_log_file, header=True, encoding='utf-8', index=False, mode='w')
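The naming scheme above is worth spelling out. A standalone sketch of the report filenames this would produce, with attack names assumed to match the short names earlier in the diff:

import time

time_str = time.strftime("%Y%m%d-%H%M%S")  # e.g. "20240315-142501"
for attack_name in ("MembershipKnnProbabilities",
                    "WholeDatasetKnnDistance",
                    "MembershipClassification"):
    # timestamp_reports=True: a fresh report per run, e.g.
    #   20240315-142501_MembershipClassification_results.log.csv
    print(f"{time_str}_{attack_name}_results.log.csv")
    # timestamp_reports=False: a fixed name, overwritten on each run
    # (to_csv is called with mode='w')
    print(f"{attack_name}_results.log.csv")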