diff --git a/apt/risk/data_assessment/attack_strategy_utils.py b/apt/risk/data_assessment/attack_strategy_utils.py index af12628..77bfc78 100644 --- a/apt/risk/data_assessment/attack_strategy_utils.py +++ b/apt/risk/data_assessment/attack_strategy_utils.py @@ -5,7 +5,6 @@ import numpy as np from scipy import stats from sklearn.neighbors import NearestNeighbors from tqdm import tqdm -from pandas.api.types import is_numeric_dtype, is_categorical_dtype from apt.utils.datasets import ArrayDataset @@ -169,26 +168,15 @@ class KNNAttackStrategyUtils(AttackStrategyUtils): differing_columns = [] df1_samples = df1.get_samples() df2_samples = df2.get_samples() - if df1.is_pandas: - for name, _ in df1_samples.items(): - is_categorical = name in categorical_features or is_categorical_dtype(df1_samples.dtypes[name]) - is_numeric = is_numeric_dtype(df1_samples.dtypes[name]) - KNNAttackStrategyUtils._column_statistical_test(df1_samples[name], df2_samples[name], name, - is_categorical, is_numeric, - self.distribution_comparison_numeric_test, - self.distribution_comparison_categorical_test, - self.distribution_comparison_alpha, - differing_columns) - else: - is_numeric = np.issubdtype(df1_samples.dtype, int) or np.issubdtype(df1_samples.dtype, float) + is_numeric = np.issubdtype(df1_samples.dtype, int) or np.issubdtype(df1_samples.dtype, float) - for i, column in enumerate(df1_samples.T): - is_categorical = i in categorical_features - KNNAttackStrategyUtils._column_statistical_test(df1_samples[:, i], df2_samples[:, i], i, - is_categorical, is_numeric, - self.distribution_comparison_numeric_test, - self.distribution_comparison_categorical_test, - self.distribution_comparison_alpha, differing_columns) + for i, column in enumerate(df1_samples.T): + is_categorical = i in categorical_features + KNNAttackStrategyUtils._column_statistical_test(df1_samples[:, i], df2_samples[:, i], i, + is_categorical, is_numeric, + self.distribution_comparison_numeric_test, + self.distribution_comparison_categorical_test, + self.distribution_comparison_alpha, differing_columns) return differing_columns def validate_distributions(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset, diff --git a/apt/risk/data_assessment/dataset_assessment_manager.py b/apt/risk/data_assessment/dataset_assessment_manager.py index 4d699bc..a83c0e4 100644 --- a/apt/risk/data_assessment/dataset_assessment_manager.py +++ b/apt/risk/data_assessment/dataset_assessment_manager.py @@ -47,7 +47,8 @@ class DatasetAssessmentManager: synthetic_data: ArrayDataset, dataset_name: str = DEFAULT_DATASET_NAME, categorical_features: list = [])\ -> list[DatasetAttackScore]: """ - Do dataset privacy risk assessment by running dataset attacks, and return their scores. + Do dataset privacy risk assessment by running dataset attacks, and return their scores. All data is assumed + to be encoded and scaled. :param original_data_members: A container for the training original samples and labels, only samples are used in the assessment diff --git a/apt/risk/data_assessment/dataset_attack_membership_classification.py b/apt/risk/data_assessment/dataset_attack_membership_classification.py index 1da4aa1..abfab21 100644 --- a/apt/risk/data_assessment/dataset_attack_membership_classification.py +++ b/apt/risk/data_assessment/dataset_attack_membership_classification.py @@ -71,9 +71,11 @@ class DatasetAttackMembershipClassification(DatasetAttackMembership): config: DatasetAttackConfigMembershipClassification = DatasetAttackConfigMembershipClassification(), dataset_name: str = DEFAULT_DATASET_NAME, categorical_features: list = None): """ - :param original_data_members: A container for the training original samples and labels - :param original_data_non_members: A container for the holdout original samples and labels - :param synthetic_data: A container for the synthetic samples and labels + :param original_data_members: A container for the training original samples and labels. Should be encoded and + scaled. + :param original_data_non_members: A container for the holdout original samples and labels. Should be encoded + and scaled. + :param synthetic_data: A container for the synthetic samples and labels. Should be encoded and scaled. :param config: Configuration parameters to guide the attack, optional :param dataset_name: A name to identify this dataset, optional """ diff --git a/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py b/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py index 8b3abe9..21f46c6 100644 --- a/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py +++ b/apt/risk/data_assessment/dataset_attack_membership_knn_probabilities.py @@ -84,9 +84,11 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership): dataset_name: str = DEFAULT_DATASET_NAME, categorical_features: list = None, **kwargs): """ - :param original_data_members: A container for the training original samples and labels - :param original_data_non_members: A container for the holdout original samples and labels - :param synthetic_data: A container for the synthetic samples and labels + :param original_data_members: A container for the training original samples and labels. Should be encoded and + scaled. + :param original_data_non_members: A container for the holdout original samples and labels. Should be encoded and + scaled. + :param synthetic_data: A container for the synthetic samples and labels. Should be encoded and scaled. :param config: Configuration parameters to guide the attack, optional :param dataset_name: A name to identify this dataset, optional """ diff --git a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py index 2eb9aeb..2c24a3f 100644 --- a/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py +++ b/apt/risk/data_assessment/dataset_attack_whole_dataset_knn_distance.py @@ -77,9 +77,11 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack): config: DatasetAttackConfigWholeDatasetKnnDistance = DatasetAttackConfigWholeDatasetKnnDistance(), dataset_name: str = DEFAULT_DATASET_NAME, categorical_features: list = None, **kwargs): """ - :param original_data_members: A container for the training original samples and labels - :param original_data_non_members: A container for the holdout original samples and labels - :param synthetic_data: A container for the synthetic samples and labels + :param original_data_members: A container for the training original samples and labels. Should be encoded and + scaled. + :param original_data_non_members: A container for the holdout original samples and labels. Should be encoded + and scaled. + :param synthetic_data: A container for the synthetic samples and labels. Should be encoded and scaled. :param config: Configuration parameters to guide the assessment process, optional :param dataset_name: A name to identify this dataset, optional """ diff --git a/tests/test_data_assessment_short_test.py b/tests/test_data_assessment_short_test.py index 7416d47..1ca11d5 100644 --- a/tests/test_data_assessment_short_test.py +++ b/tests/test_data_assessment_short_test.py @@ -1,3 +1,4 @@ +import pandas as pd import pytest from apt.anonymization import Anonymize @@ -52,6 +53,8 @@ def test_risk_anonymization(name, data, dataset_type, mgr): categorical_features = [] elif "nursery" in name: preprocessed_x_train, preprocessed_x_test, categorical_features = preprocess_nursery_x_data(x_train, x_test) + preprocessed_x_train = pd.DataFrame(preprocessed_x_train) + preprocessed_x_test = pd.DataFrame(preprocessed_x_test) QI = list(range(15, 20)) anonymizer = Anonymize(ANON_K, QI, train_only_QI=True) else: