mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-04-24 20:36:21 +02:00
Fix error with pandas dataframes (#92)
* Fix error with pandas dataframes in _columns_different_distributions + add appropriate test * Update documentation of classes to reflect that all data should be encoded and scaled. --------- Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
parent
cb70ca10e6
commit
e00535d120
6 changed files with 28 additions and 30 deletions
|
|
@@ -5,7 +5,6 @@ import numpy as np
|
|||
from scipy import stats
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
from tqdm import tqdm
|
||||
from pandas.api.types import is_numeric_dtype, is_categorical_dtype
|
||||
|
||||
from apt.utils.datasets import ArrayDataset
|
||||
|
||||
|
|
@@ -169,26 +168,15 @@ class KNNAttackStrategyUtils(AttackStrategyUtils):
|
|||
differing_columns = []
|
||||
df1_samples = df1.get_samples()
|
||||
df2_samples = df2.get_samples()
|
||||
if df1.is_pandas:
|
||||
for name, _ in df1_samples.items():
|
||||
is_categorical = name in categorical_features or is_categorical_dtype(df1_samples.dtypes[name])
|
||||
is_numeric = is_numeric_dtype(df1_samples.dtypes[name])
|
||||
KNNAttackStrategyUtils._column_statistical_test(df1_samples[name], df2_samples[name], name,
|
||||
is_categorical, is_numeric,
|
||||
self.distribution_comparison_numeric_test,
|
||||
self.distribution_comparison_categorical_test,
|
||||
self.distribution_comparison_alpha,
|
||||
differing_columns)
|
||||
else:
|
||||
is_numeric = np.issubdtype(df1_samples.dtype, int) or np.issubdtype(df1_samples.dtype, float)
|
||||
is_numeric = np.issubdtype(df1_samples.dtype, int) or np.issubdtype(df1_samples.dtype, float)
|
||||
|
||||
for i, column in enumerate(df1_samples.T):
|
||||
is_categorical = i in categorical_features
|
||||
KNNAttackStrategyUtils._column_statistical_test(df1_samples[:, i], df2_samples[:, i], i,
|
||||
is_categorical, is_numeric,
|
||||
self.distribution_comparison_numeric_test,
|
||||
self.distribution_comparison_categorical_test,
|
||||
self.distribution_comparison_alpha, differing_columns)
|
||||
for i, column in enumerate(df1_samples.T):
|
||||
is_categorical = i in categorical_features
|
||||
KNNAttackStrategyUtils._column_statistical_test(df1_samples[:, i], df2_samples[:, i], i,
|
||||
is_categorical, is_numeric,
|
||||
self.distribution_comparison_numeric_test,
|
||||
self.distribution_comparison_categorical_test,
|
||||
self.distribution_comparison_alpha, differing_columns)
|
||||
return differing_columns
|
||||
|
||||
def validate_distributions(self, original_data_members: ArrayDataset, original_data_non_members: ArrayDataset,
|
||||
|
|
|
|||
|
|
@@ -47,7 +47,8 @@ class DatasetAssessmentManager:
|
|||
synthetic_data: ArrayDataset, dataset_name: str = DEFAULT_DATASET_NAME, categorical_features: list = [])\
|
||||
-> list[DatasetAttackScore]:
|
||||
"""
|
||||
Do dataset privacy risk assessment by running dataset attacks, and return their scores.
|
||||
Do dataset privacy risk assessment by running dataset attacks, and return their scores. All data is assumed
|
||||
to be encoded and scaled.
|
||||
|
||||
:param original_data_members: A container for the training original samples and labels,
|
||||
only samples are used in the assessment
|
||||
|
|
|
|||
|
|
@@ -71,9 +71,11 @@ class DatasetAttackMembershipClassification(DatasetAttackMembership):
|
|||
config: DatasetAttackConfigMembershipClassification = DatasetAttackConfigMembershipClassification(),
|
||||
dataset_name: str = DEFAULT_DATASET_NAME, categorical_features: list = None):
|
||||
"""
|
||||
:param original_data_members: A container for the training original samples and labels
|
||||
:param original_data_non_members: A container for the holdout original samples and labels
|
||||
:param synthetic_data: A container for the synthetic samples and labels
|
||||
:param original_data_members: A container for the training original samples and labels. Should be encoded and
|
||||
scaled.
|
||||
:param original_data_non_members: A container for the holdout original samples and labels. Should be encoded
|
||||
and scaled.
|
||||
:param synthetic_data: A container for the synthetic samples and labels. Should be encoded and scaled.
|
||||
:param config: Configuration parameters to guide the attack, optional
|
||||
:param dataset_name: A name to identify this dataset, optional
|
||||
"""
|
||||
|
|
|
|||
|
|
@@ -84,9 +84,11 @@ class DatasetAttackMembershipKnnProbabilities(DatasetAttackMembership):
|
|||
dataset_name: str = DEFAULT_DATASET_NAME,
|
||||
categorical_features: list = None, **kwargs):
|
||||
"""
|
||||
:param original_data_members: A container for the training original samples and labels
|
||||
:param original_data_non_members: A container for the holdout original samples and labels
|
||||
:param synthetic_data: A container for the synthetic samples and labels
|
||||
:param original_data_members: A container for the training original samples and labels. Should be encoded and
|
||||
scaled.
|
||||
:param original_data_non_members: A container for the holdout original samples and labels. Should be encoded and
|
||||
scaled.
|
||||
:param synthetic_data: A container for the synthetic samples and labels. Should be encoded and scaled.
|
||||
:param config: Configuration parameters to guide the attack, optional
|
||||
:param dataset_name: A name to identify this dataset, optional
|
||||
"""
|
||||
|
|
|
|||
|
|
@@ -77,9 +77,11 @@ class DatasetAttackWholeDatasetKnnDistance(DatasetAttack):
|
|||
config: DatasetAttackConfigWholeDatasetKnnDistance = DatasetAttackConfigWholeDatasetKnnDistance(),
|
||||
dataset_name: str = DEFAULT_DATASET_NAME, categorical_features: list = None, **kwargs):
|
||||
"""
|
||||
:param original_data_members: A container for the training original samples and labels
|
||||
:param original_data_non_members: A container for the holdout original samples and labels
|
||||
:param synthetic_data: A container for the synthetic samples and labels
|
||||
:param original_data_members: A container for the training original samples and labels. Should be encoded and
|
||||
scaled.
|
||||
:param original_data_non_members: A container for the holdout original samples and labels. Should be encoded
|
||||
and scaled.
|
||||
:param synthetic_data: A container for the synthetic samples and labels. Should be encoded and scaled.
|
||||
:param config: Configuration parameters to guide the assessment process, optional
|
||||
:param dataset_name: A name to identify this dataset, optional
|
||||
"""
|
||||
|
|
|
|||
|
|
@@ -1,3 +1,4 @@
|
|||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from apt.anonymization import Anonymize
|
||||
|
|
@@ -52,6 +53,8 @@ def test_risk_anonymization(name, data, dataset_type, mgr):
|
|||
categorical_features = []
|
||||
elif "nursery" in name:
|
||||
preprocessed_x_train, preprocessed_x_test, categorical_features = preprocess_nursery_x_data(x_train, x_test)
|
||||
preprocessed_x_train = pd.DataFrame(preprocessed_x_train)
|
||||
preprocessed_x_test = pd.DataFrame(preprocessed_x_test)
|
||||
QI = list(range(15, 20))
|
||||
anonymizer = Anonymize(ANON_K, QI, train_only_QI=True)
|
||||
else:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue