Fix error with pandas dataframes (#92)

* Fix error with pandas DataFrames in _columns_different_distributions and add a corresponding test
* Update documentation of classes to reflect that all data should be encoded and scaled.

---------

Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
abigailgold 2024-02-13 08:56:12 -05:00 committed by GitHub
parent cb70ca10e6
commit e00535d120
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 28 additions and 30 deletions

View file

@ -5,7 +5,6 @@ import numpy as np
from scipy import stats
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm
from pandas.api.types import is_numeric_dtype, is_categorical_dtype
from apt.utils.datasets import ArrayDataset
@ -169,26 +168,15 @@ class KNNAttackStrategyUtils(AttackStrategyUtils):
differing_columns = []
df1_samples = df1.get_samples()
df2_samples = df2.get_samples()
if df1.is_pandas:
for name, _ in df1_samples.items():
is_categorical = name in categorical_features or is_categorical_dtype(df1_samples.dtypes[name])
is_numeric = is_numeric_dtype(df1_samples.dtypes[name])
KNNAttackStrategyUtils._column_statistical_test(df1_samples[name], df2_samples[name], name,
is_categorical, is_numeric,
self.distribution_comparison_numeric_test,
self.distribution_comparison_categorical_test,
self.distribution_comparison_alpha,
differing_columns)
else:
is_numeric = np.issubdtype(df1_samples.dtype, int) or np.issubdtype(df1_samples.dtype, float)
is_numeric = np.issubdtype(df1_samples.dtype, int) or np.issubdtype(df1_samples.dtype, float)
for i, column in enumerate(df1_samples.T):
is_categorical = i in categorical_features
KNNAttackStrategyUtils._column_statistical_test(df1_samples[:, i], df2_samples[:, i], i,
is_categorical, is_numeric,
self.distribution_comparison_numeric_test,
self.distribution_comparison_categorical_test,
self.distribution_comparison_alpha, differing_columns)
for i, column in enumerate(df1_samples.T):
is_categorical = i in categorical_features
KNNAttackStrategyUtils._column_statistical_test(df1_samples[:, i], df2_samples[:, i], i,
is_categorical, is_numeric,
self.distribution_comparison_numeric_test,
self.distribution_comparison_categorical_test,
self.distribution_comparison_alpha, differing_columns)
return differing_columns
def validate_distributions(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,

View file

@ -47,7 +47,8 @@ class DatasetAssessmentManager:
synthetic_data: ArrayDataset, dataset_name: str = DEFAULT_DATASET_NAME, categorical_features: list = [])\
-> list[DatasetAttackScore]:
"""
Do dataset privacy risk assessment by running dataset attacks, and return their scores.
Do dataset privacy risk assessment by running dataset attacks, and return their scores. All data is assumed
to be encoded and scaled.
:param original_data_members: A container for the training original samples and labels,
only samples are used in the assessment

View file

@ -71,9 +71,11 @@ class DatasetAttackMembershipClassification(DatasetAttackMembership):
config: DatasetAttackConfigMembershipClassification = DatasetAttackConfigMembershipClassification(),
dataset_name: str = DEFAULT_DATASET_NAME, categorical_features: list = None):
"""
:param original_data_members: A container for the training original samples and labels
:param original_data_non_members: A container for the holdout original samples and labels
:param synthetic_data: A container for the synthetic samples and labels
:param original_data_members: A container for the training original samples and labels. Should be encoded and
scaled.
:param original_data_non_members: A container for the holdout original samples and labels. Should be encoded
and scaled.
:param synthetic_data: A container for the synthetic samples and labels. Should be encoded and scaled.
:param config: Configuration parameters to guide the attack, optional
:param dataset_name: A name to identify this dataset, optional
"""

View file

@ -84,9 +84,11 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership):
dataset_name: str = DEFAULT_DATASET_NAME,
categorical_features: list = None, **kwargs):
"""
:param original_data_members: A container for the training original samples and labels
:param original_data_non_members: A container for the holdout original samples and labels
:param synthetic_data: A container for the synthetic samples and labels
:param original_data_members: A container for the training original samples and labels. Should be encoded and
scaled.
:param original_data_non_members: A container for the holdout original samples and labels. Should be encoded and
scaled.
:param synthetic_data: A container for the synthetic samples and labels. Should be encoded and scaled.
:param config: Configuration parameters to guide the attack, optional
:param dataset_name: A name to identify this dataset, optional
"""

View file

@ -77,9 +77,11 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
config: DatasetAttackConfigWholeDatasetKnnDistance = DatasetAttackConfigWholeDatasetKnnDistance(),
dataset_name: str = DEFAULT_DATASET_NAME, categorical_features: list = None, **kwargs):
"""
:param original_data_members: A container for the training original samples and labels
:param original_data_non_members: A container for the holdout original samples and labels
:param synthetic_data: A container for the synthetic samples and labels
:param original_data_members: A container for the training original samples and labels. Should be encoded and
scaled.
:param original_data_non_members: A container for the holdout original samples and labels. Should be encoded
and scaled.
:param synthetic_data: A container for the synthetic samples and labels. Should be encoded and scaled.
:param config: Configuration parameters to guide the assessment process, optional
:param dataset_name: A name to identify this dataset, optional
"""

View file

@ -1,3 +1,4 @@
import pandas as pd
import pytest
from apt.anonymization import Anonymize
@ -52,6 +53,8 @@ def test_risk_anonymization(name, data, dataset_type, mgr):
categorical_features = []
elif "nursery" in name:
preprocessed_x_train, preprocessed_x_test, categorical_features = preprocess_nursery_x_data(x_train, x_test)
preprocessed_x_train = pd.DataFrame(preprocessed_x_train)
preprocessed_x_test = pd.DataFrame(preprocessed_x_test)
QI = list(range(15, 20))
anonymizer = Anonymize(ANON_K, QI, train_only_QI=True)
else: