Make data minimization more consistent and performant (#83)

* Update requirements

* Update incompatible scipy version

* Reduce runtime of dataset assessment tests

* ncp is now a class that contains 3 values: fit_score, transform_score and generalizations_score so that it doesn't matter in what order the different methods are called, all calculated ncp scores are stored.
Generalizations can now be applied either from tree cells or from global generalizations struct depending on the value of generalize_using_transform. Representative values can also be computed from global generalizations.
Removing a feature from the generalization can also be applied in either mode.

* Compute generalizations with test data when possible (for computing better representatives).

* Externalize common test code to methods.
This commit is contained in:
abigailgold 2023-08-21 18:39:15 +03:00 committed by GitHub
parent e9a225501f
commit 13a0567183
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 1004 additions and 689 deletions

View file

@ -1,6 +1,5 @@
import numpy as np
import pandas as pd
from scipy.spatial import distance
from collections import Counter
from sklearn.compose import ColumnTransformer
@ -146,7 +145,8 @@ class Anonymize:
min_value = max(values)
min_dist = float("inf")
for value in values:
dist = distance.euclidean(value, median)
# euclidean distance between two floating point values
dist = abs(value - median)
if dist < min_dist:
min_dist = dist
min_value = value

View file

@ -2,6 +2,8 @@
This module implements all classes needed to perform data minimization
"""
from typing import Union, Optional
from dataclasses import dataclass
from collections import Counter
import pandas as pd
import numpy as np
import copy
@ -20,6 +22,13 @@ from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
from apt.utils.models import Model, SklearnRegressor, ModelOutputType, SklearnClassifier
@dataclass
class NCPScores:
fit_score: float = None
transform_score: float = None
generalizations_score: float = None
class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
"""
A transformer that generalizes data to representative points.
@ -59,14 +68,23 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
:param is_regression: Whether the model is a regression model or not (if False, assumes a classification model).
Default is False.
:type is_regression: boolean, optional
:param generalize_using_transform: Indicates how to calculate NCP and accuracy during the generalization
process. True means that the `transform` method is used to transform original
data into generalized data that is used for accuracy and NCP calculation.
False indicates that the `generalizations` structure should be used.
Default is True.
:type generalize_using_transform: boolean, optional
"""
def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: Optional[float] = 0.998,
cells: Optional[list] = None, categorical_features: Optional[Union[np.ndarray, list]] = None,
def __init__(self, estimator: Union[BaseEstimator, Model] = None,
target_accuracy: Optional[float] = 0.998,
cells: Optional[list] = None,
categorical_features: Optional[Union[np.ndarray, list]] = None,
encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None,
features_to_minimize: Optional[Union[np.ndarray, list]] = None,
train_only_features_to_minimize: Optional[bool] = True,
is_regression: Optional[bool] = False):
is_regression: Optional[bool] = False,
generalize_using_transform: bool = True):
self.estimator = estimator
if estimator is not None and not issubclass(estimator.__class__, Model):
@ -76,6 +94,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_PROBABILITIES)
self.target_accuracy = target_accuracy
self.cells = cells
if cells:
self._calculate_generalizations()
self.categorical_features = []
if categorical_features:
self.categorical_features = categorical_features
@ -83,6 +103,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.train_only_features_to_minimize = train_only_features_to_minimize
self.is_regression = is_regression
self.encoder = encoder
self.generalize_using_transform = generalize_using_transform
self._ncp_scores = NCPScores()
self._feature_data = None
self._categorical_values = {}
self._dt = None
self._features = None
self._level = 0
def get_params(self, deep=True):
"""
@ -99,12 +126,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
ret['features_to_minimize'] = self.features_to_minimize
ret['train_only_features_to_minimize'] = self.train_only_features_to_minimize
ret['is_regression'] = self.is_regression
if deep:
ret['cells'] = copy.deepcopy(self.cells)
ret['estimator'] = self.estimator
ret['encoder'] = self.encoder
if deep:
ret['cells'] = copy.deepcopy(self.cells)
else:
ret['cells'] = copy.copy(self.cells)
return ret
def set_params(self, **params):
@ -132,6 +160,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.is_regression = params['is_regression']
if 'cells' in params:
self.cells = params['cells']
if 'estimator' in params:
self.estimator = params['estimator']
if 'encoder' in params:
self.encoder = params['encoder']
return self
@property
@ -148,16 +180,19 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
@property
def ncp(self):
"""
Return the NCP score of the generalizations.
Return the last calculated NCP scores. NCP score is calculated upon calling `fit` (on the training data),
`transform' (on the test data) or when explicitly calling `calculate_ncp` and providing it a dataset.
:return: ncp score as float.
:return: NCPScores object, that contains a score corresponding to the last fit call, one for the last
transform call, and a score based on global generalizations.
"""
return self._ncp
return self._ncp_scores
def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
features_names: Optional[list] = None, dataset: Optional[ArrayDataset] = None):
"""
Learns the generalizations based on training data, and applies them to the data.
Learns the generalizations based on training data, and applies them to the data. Also sets the fit_score,
transform_score and generalizations_score in self.ncp.
:param X: The training input samples.
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
@ -172,19 +207,23 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
"""
if not self.generalize_using_transform:
raise ValueError('fit_transform method called even though generalize_using_transform parameter was False. '
'This can lead to inconsistent results.')
self.fit(X, y, features_names, dataset=dataset)
return self.transform(X, features_names, dataset=dataset)
def fit(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
features_names: Optional = None, dataset: ArrayDataset = None):
"""Learns the generalizations based on training data.
"""Learns the generalizations based on training data. Also sets the fit_score and generalizations_score in
self.ncp.
:param X: The training input samples.
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
:param y: The target values. This should contain the predictions of the original model on ``X``.
:type y: array-like, shape (n_samples,), optional
:param features_names: The feature names, in the order that they appear in the data. Can be provided when
passing the data as ``X`` and ``y``
:param features_names: The feature names, in the order that they appear in the data. Should be provided when
passing the data as ``X`` as a numpy array
:type features_names: list of strings, optional
:param dataset: Data wrapper containing the training input samples and the predictions of the original model
on the training data. Either ``X``, ``y`` OR ``dataset`` need to be provided, not both.
@ -223,46 +262,35 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.features_to_minimize = [str(i) for i in self.features_to_minimize]
if not all(elem in self._features for elem in self.features_to_minimize):
raise ValueError('features to minimize should be a subset of features names')
x_QI = x.loc[:, self.features_to_minimize]
x_qi = x.loc[:, self.features_to_minimize]
# divide dataset into train and test
used_data = x
if self.train_only_features_to_minimize:
used_data = x_QI
used_data = x_qi
if self.is_regression:
X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4,
x_train, x_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4,
random_state=14)
else:
try:
X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(),
x_train, x_test, y_train, y_test = train_test_split(x, dataset.get_labels(),
stratify=dataset.get_labels(), test_size=0.4,
random_state=18)
except ValueError:
print('Could not stratify split due to uncommon class value, doing unstratified split instead')
X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4,
x_train, x_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4,
random_state=18)
X_train_QI = X_train.loc[:, self.features_to_minimize]
X_test_QI = X_test.loc[:, self.features_to_minimize]
used_X_train = X_train
used_X_test = X_test
x_train_qi = x_train.loc[:, self.features_to_minimize]
x_test_qi = x_test.loc[:, self.features_to_minimize]
used_x_train = x_train
used_x_test = x_test
if self.train_only_features_to_minimize:
used_X_train = X_train_QI
used_X_test = X_test_QI
used_x_train = x_train_qi
used_x_test = x_test_qi
# collect feature data (such as min, max)
feature_data = {}
for feature in self._features:
if feature not in feature_data.keys():
fd = {}
values = list(x.loc[:, feature])
if feature not in self.categorical_features:
fd['min'] = min(values)
fd['max'] = max(values)
fd['range'] = max(values) - min(values)
else:
fd['range'] = len(np.unique(values))
feature_data[feature] = fd
self._feature_data = self._get_feature_data(x)
# default encoder in case none provided
if self.encoder is None:
@ -290,9 +318,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# prepare data for DT
self._encode_categorical_features(used_data, save_mapping=True)
x_prepared = self._encode_categorical_features(used_X_train)
x_prepared = self._encode_categorical_features(used_x_train)
self._dt.fit(x_prepared, y_train)
x_prepared_test = self._encode_categorical_features(used_X_test)
x_prepared_test = self._encode_categorical_features(used_x_test)
self._calculate_cells()
self._modify_cells()
@ -302,11 +330,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._remove_feature_from_cells(self.cells, self._cells_by_id, feature)
nodes = self._get_nodes_level(0)
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
# self._cells currently holds the generalization created from the tree leaves
self._calculate_generalizations()
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
self._calculate_generalizations(x_test)
if self.generalize_using_transform:
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
else:
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
# check accuracy
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
@ -316,66 +347,79 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# if accuracy above threshold, improve generalization
if accuracy > self.target_accuracy:
print('Improving generalizations')
level = 1
self._level = 1
while accuracy > self.target_accuracy:
cells_previous_iter = self.cells
generalization_prev_iter = self._generalizations
cells_by_id_prev = self._cells_by_id
nodes = self._get_nodes_level(level)
nodes = self._get_nodes_level(self._level)
try:
self._calculate_level_cells(level)
self._calculate_level_cells(self._level)
except TypeError as e:
print(e)
self._level -= 1
break
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
self._calculate_generalizations()
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells,
self._calculate_generalizations(x_test)
if self.generalize_using_transform:
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
self._cells_by_id)
else:
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
# if accuracy passed threshold roll back to previous iteration generalizations
if accuracy < self.target_accuracy:
self.cells = cells_previous_iter
self._generalizations = generalization_prev_iter
self._cells_by_id = cells_by_id_prev
self._level -= 1
break
else:
print('Pruned tree to level: %d, new relative accuracy: %f' % (level, accuracy))
level += 1
print('Pruned tree to level: %d, new relative accuracy: %f' % (self._level, accuracy))
self._level += 1
# if accuracy below threshold, improve accuracy by removing features from generalization
elif accuracy < self.target_accuracy:
print('Improving accuracy')
while accuracy < self.target_accuracy:
removed_feature = self._remove_feature_from_generalization(X_test, x_prepared_test,
removed_feature = self._remove_feature_from_generalization(x_test, x_prepared_test,
nodes, y_test,
feature_data, accuracy)
self._feature_data, accuracy,
self.generalize_using_transform)
if removed_feature is None:
break
self._calculate_generalizations()
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
self._calculate_generalizations(x_test)
if self.generalize_using_transform:
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
self._cells_by_id)
else:
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
# self._cells currently holds the chosen generalization based on target accuracy
# calculate iLoss
self._ncp = self._calculate_ncp(X_test, self._generalizations, feature_data)
x_test_dataset = ArrayDataset(x_test, features_names=self._features)
self._ncp_scores.fit_score = self.calculate_ncp(x_test_dataset)
self._ncp_scores.generalizations_score = self.calculate_ncp(x_test_dataset)
# Return the transformer
return self
def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None,
dataset: Optional[ArrayDataset] = None):
""" Transforms data records to representative points.
""" Transforms data records to representative points. Also sets the transform_score in self.ncp.
:param X: The training input samples.
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
:param features_names: The feature names, in the order that they appear in the data. Can be provided when
passing the data as ``X`` and ``y``
:param features_names: The feature names, in the order that they appear in the data. Should be provided when
passing the data as ``X`` as a numpy array
:type features_names: list of strings, optional
:param dataset: Data wrapper containing the training input samples and the predictions of the original model
on the training data. Either ``X`` OR ``dataset`` need to be provided, not both.
@ -383,69 +427,197 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
"""
if not self.generalize_using_transform:
raise ValueError('transform method called even though generalize_using_transform parameter was False. This '
'can lead to inconsistent results.')
transformed = self._inner_transform(X, features_names, dataset)
transformed_dataset = ArrayDataset(transformed, features_names=self._features)
self._ncp_scores.transform_score = self.calculate_ncp(transformed_dataset)
return transformed
def calculate_ncp(self, samples: ArrayDataset):
"""
Compute the NCP score of the generalization. Calculation is based on the value of the
generalize_using_transform param. If samples are provided, updates stored ncp value to the one computed on the
provided data. If samples not provided, returns the last NCP score computed by the `fit` or `transform` method.
Based on the NCP score presented in: Ghinita, G., Karras, P., Kalnis, P., Mamoulis, N.: Fast data anonymization
with low information loss (https://www.vldb.org/conf/2007/papers/research/p758-ghinita.pdf)
:param samples: The input samples to compute the NCP score on.
:type samples: ArrayDataset, optional. feature_names should be set.
:return: NCP score as float.
"""
if not samples.features_names:
raise ValueError('features_names should be set in input ArrayDataset.')
samples_pd = pd.DataFrame(samples.get_samples(), columns=samples.features_names)
if self._features is None:
self._features = samples.features_names
if self._feature_data is None:
self._feature_data = self._get_feature_data(samples_pd)
total_samples = samples_pd.shape[0]
if self.generalize_using_transform:
generalizations = self._calculate_cell_generalizations()
# count how many records are mapped to each cell
counted = np.zeros(samples_pd.shape[0]) # to mark records we already counted
ncp = 0
for cell in self.cells:
count = self._get_record_count_for_cell(samples_pd, cell, counted)
range_counts = {}
category_counts = {}
for feature in cell['ranges']:
range_counts[feature] = [count]
for feature in cell['categories']:
category_counts[feature] = [count]
ncp += self._calc_ncp_for_generalization(generalizations[cell['id']], range_counts, category_counts,
total_samples)
else: # use generalizations
generalizations = self.generalizations
range_counts = self._find_range_counts(samples_pd, generalizations['ranges'])
category_counts = self._find_category_counts(samples_pd, generalizations['categories'])
ncp = self._calc_ncp_for_generalization(generalizations, range_counts, category_counts, total_samples)
return ncp
def _inner_transform(self, x: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None,
dataset: Optional[ArrayDataset] = None):
# Check if fit has been called
msg = 'This %(name)s instance is not initialized yet. ' \
'Call fit or set_params with ' \
'appropriate arguments before using this method.'
check_is_fitted(self, ['cells'], msg=msg)
if X is not None:
if x is not None:
if dataset is not None:
raise ValueError('Either X OR dataset need to be provided, not both')
raise ValueError('Either x OR dataset need to be provided, not both')
else:
dataset = ArrayDataset(X, features_names=features_names)
dataset = ArrayDataset(x, features_names=features_names)
elif dataset is None:
raise ValueError('Either X OR dataset need to be provided, not both')
raise ValueError('Either x OR dataset need to be provided, not both')
if dataset and dataset.features_names:
if self._features is None:
self._features = dataset.features_names
if dataset and dataset.get_samples() is not None:
x = pd.DataFrame(dataset.get_samples(), columns=self._features)
x_pd = pd.DataFrame(dataset.get_samples(), columns=self._features)
if x.shape[1] != self._n_features and self._n_features != 0:
if x_pd.shape[1] != self._n_features and self._n_features != 0:
raise ValueError('Shape of input is different from what was seen'
'in `fit`')
if not self._features:
self._features = [i for i in range(x.shape[1])]
self._features = [i for i in range(x_pd.shape[1])]
mapped = np.zeros(x.shape[0]) # to mark records we already mapped
if self._dt: # only works if fit was called previously (but much more efficient)
nodes = self._get_nodes_level(self._level)
QI = x_pd.loc[:, self.features_to_minimize]
used_x = x_pd
if self.train_only_features_to_minimize:
used_x = QI
prepared = self._encode_categorical_features(used_x)
generalized = self._generalize_from_tree(x_pd, prepared, nodes, self.cells, self._cells_by_id)
else:
mapped = np.zeros(x_pd.shape[0]) # to mark records we already mapped
all_indexes = []
for i in range(len(self.cells)):
indexes = self._get_record_indexes_for_cell(x, self.cells[i], mapped)
for cell in self.cells:
indexes = self._get_record_indexes_for_cell(x_pd, cell, mapped)
all_indexes.append(indexes)
generalized = self._generalize_indexes(x, self.cells, all_indexes)
generalized = self._generalize_indexes(x_pd, self.cells, all_indexes)
if dataset and dataset.is_pandas:
return generalized
elif isinstance(X, pd.DataFrame):
elif isinstance(x, pd.DataFrame):
return generalized
return generalized.to_numpy()
def _get_record_indexes_for_cell(self, X, cell, mapped):
def _calc_ncp_for_generalization(self, generalization, range_counts, category_counts, total_count):
total_ncp = 0
total_features = len(generalization['untouched'])
ranges = generalization['ranges']
categories = generalization['categories']
# suppressed features are already taken care of within _calc_ncp_numeric
for feature in ranges.keys():
feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature],
self._feature_data[feature], total_count)
total_ncp = total_ncp + feature_ncp
total_features += 1
for feature in categories.keys():
feature_ncp = self._calc_ncp_categorical(categories[feature], category_counts[feature],
self._feature_data[feature],
total_count)
total_ncp = total_ncp + feature_ncp
total_features += 1
if total_features == 0:
return 0
return total_ncp / total_features
@staticmethod
def _calc_ncp_categorical(categories, category_count, feature_data, total):
category_sizes = [len(g) if len(g) > 1 else 0 for g in categories]
normalized_category_sizes = [s * n / total for s, n in zip(category_sizes, category_count)]
average_group_size = sum(normalized_category_sizes) / len(normalized_category_sizes)
return average_group_size / feature_data['range'] # number of values in category
@staticmethod
def _calc_ncp_numeric(range, range_count, feature_data, total):
# if there are no ranges, feature is suppressed and iLoss is 1
if not range:
return 1
# range only contains the split values, need to add min and max value of feature
# to enable computing sizes of all ranges
new_range = [feature_data['min']] + range + [feature_data['max']]
range_sizes = [b - a for a, b in zip(new_range[::1], new_range[1::1])]
normalized_range_sizes = [s * n / total for s, n in zip(range_sizes, range_count)]
average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes)
return average_range_size / (feature_data['max'] - feature_data['min'])
def _get_feature_data(self, x):
feature_data = {}
for feature in self._features:
if feature not in feature_data.keys():
fd = {}
values = list(x.loc[:, feature])
if feature not in self.categorical_features:
fd['min'] = min(values)
fd['max'] = max(values)
fd['range'] = max(values) - min(values)
else:
fd['range'] = len(np.unique(values))
feature_data[feature] = fd
return feature_data
def _get_record_indexes_for_cell(self, x, cell, mapped):
indexes = []
for index, row in X.iterrows():
for index, row in x.iterrows():
if not mapped.item(index) and self._cell_contains(cell, row, index, mapped):
indexes.append(index)
return indexes
def _cell_contains(self, cell, x, i, mapped):
for f in self._features:
if f in cell['ranges']:
if not self._cell_contains_numeric(f, cell['ranges'][f], x):
def _get_record_count_for_cell(self, x, cell, mapped):
count = 0
for index, (_, row) in enumerate(x.iterrows()):
if not mapped.item(index) and self._cell_contains(cell, row, index, mapped):
count += 1
return count
def _cell_contains(self, cell, row, index, mapped):
for i, feature in enumerate(self._features):
if feature in cell['ranges']:
if not self._cell_contains_numeric(i, cell['ranges'][feature], row):
return False
elif f in cell['categories']:
if not self._cell_contains_categorical(f, cell['categories'][f], x):
elif feature in cell['categories']:
if not self._cell_contains_categorical(i, cell['categories'][feature], row):
return False
elif f in cell['untouched']:
elif feature in cell['untouched']:
continue
else:
raise TypeError("feature " + f + "not found in cell" + cell['id'])
raise TypeError("feature " + feature + "not found in cell" + cell['id'])
# Mark as mapped
mapped.itemset(i, 1)
mapped.itemset(index, 1)
return True
def _encode_categorical_features(self, X, save_mapping=False):
def _encode_categorical_features(self, x, save_mapping=False):
if save_mapping:
self._categorical_values = {}
self._one_hot_vector_features_to_features = {}
@ -456,31 +628,31 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
for feature in self.categorical_features:
if feature in used_features:
try:
all_values = X.loc[:, feature]
all_values = x.loc[:, feature]
values = list(all_values.unique())
if save_mapping:
self._categorical_values[feature] = values
X[feature] = pd.Categorical(X.loc[:, feature], categories=self._categorical_values[feature],
x[feature] = pd.Categorical(x.loc[:, feature], categories=self._categorical_values[feature],
ordered=False)
ohe = pd.get_dummies(X[feature], prefix=feature)
ohe = pd.get_dummies(x[feature], prefix=feature)
if save_mapping:
for one_hot_vector_feature in ohe.columns:
self._one_hot_vector_features_to_features[one_hot_vector_feature] = feature
X = pd.concat([X, ohe], axis=1)
x = pd.concat([x, ohe], axis=1)
features_to_remove.append(feature)
except KeyError:
print("feature " + feature + "not found in training data")
new_data = X.drop(features_to_remove, axis=1)
new_data = x.drop(features_to_remove, axis=1)
if save_mapping:
self._encoded_features = new_data.columns
return new_data
def _cell_contains_numeric(self, f, range, x):
i = self._features.index(f)
# convert x to ndarray to allow indexing
a = np.array(x)
value = a.item(i)
@staticmethod
def _cell_contains_numeric(index, range, row):
# convert row to ndarray to allow indexing
a = np.array(row)
value = a.item(index)
if range['start']:
if value <= range['start']:
return False
@ -489,11 +661,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return False
return True
def _cell_contains_categorical(self, f, range, x):
i = self._features.index(f)
# convert x to ndarray to allow indexing
a = np.array(x)
value = a.item(i)
@staticmethod
def _cell_contains_categorical(index, range, row):
# convert row to ndarray to allow indexing
a = np.array(row)
value = a.item(index)
if value in range:
return True
return False
@ -685,7 +857,29 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
nodeSet = set(nodes)
return [(list(set([i for i, v in enumerate(p) if v == 1]) & nodeSet))[0] for p in paths]
def _generalize(self, original_data, prepared_data, level_nodes, cells, cells_by_id):
# method for applying generalizations (for global generalization-based acuuracy) without dt
def _generalize_from_generalizations(self, original_data, generalizations):
sample_indexes = self._map_to_ranges_categories(original_data,
generalizations['ranges'],
generalizations['categories'])
original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)
for feature in self._generalizations['categories']:
if 'untouched' not in generalizations or feature not in generalizations['untouched']:
for g_index, group in enumerate(generalizations['categories'][feature]):
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == g_index]
if indexes:
rows = original_data_generalized.iloc[indexes]
rows[feature] = generalizations['category_representatives'][feature][g_index]
for feature in self._generalizations['ranges']:
if 'untouched' not in generalizations or feature not in generalizations['untouched']:
for r_index, range in enumerate(generalizations['ranges'][feature]):
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == r_index]
if indexes:
rows = original_data_generalized.iloc[indexes]
rows[feature] = generalizations['range_representatives'][feature][r_index]
return original_data_generalized
def _generalize_from_tree(self, original_data, prepared_data, level_nodes, cells, cells_by_id):
mapping_to_cells = self._map_to_cells(prepared_data, level_nodes, cells_by_id)
all_indexes = []
for i in range(len(cells)):
@ -728,6 +922,29 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return original_data_generalized
@staticmethod
def _map_to_ranges_categories(samples, ranges, categories):
all_sample_indexes = []
for _, row in samples.iterrows():
sample_indexes = {}
for feature in ranges:
if not ranges[feature]:
# no values means whole range
sample_indexes[feature] = 0
else:
for index, value in enumerate(ranges[feature]):
if row[feature] <= value:
sample_indexes[feature] = index
break
sample_indexes[feature] = index + 1
for feature in categories:
for g_index, group in enumerate(categories[feature]):
if row[feature] in group:
sample_indexes[feature] = g_index
break
all_sample_indexes.append(sample_indexes)
return all_sample_indexes
def _map_to_cells(self, samples, nodes, cells_by_id):
mapping_to_cells = {}
for index, row in samples.iterrows():
@ -740,31 +957,35 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return [cells_by_id[nodeId] for nodeId in node_ids]
def _remove_feature_from_generalization(self, original_data, prepared_data, nodes, labels, feature_data,
current_accuracy):
current_accuracy, generalize_using_transform):
# prepared data include one hot encoded categorical data,
# if there is no categorical data prepared data is original data
feature = self._get_feature_to_remove(original_data, prepared_data, nodes, labels, feature_data,
current_accuracy)
current_accuracy, generalize_using_transform)
if feature is None:
return None
GeneralizeToRepresentative._remove_feature_from_cells(self.cells, self._cells_by_id, feature)
self._remove_feature_from_cells(self.cells, self._cells_by_id, feature)
return feature
def _get_feature_to_remove(self, original_data, prepared_data, nodes, labels, feature_data, current_accuracy):
def _get_feature_to_remove(self, original_data, prepared_data, nodes, labels, feature_data, current_accuracy,
generalize_using_transform):
# prepared data include one hot encoded categorical data,
# if there is no categorical data prepared data is original data
# We want to remove features with low iLoss (NCP) and high accuracy gain
# (after removing them)
ranges = self._generalizations['ranges']
range_counts = self._find_range_count(original_data, ranges)
range_counts = self._find_range_counts(original_data, ranges)
total = prepared_data.size
range_min = sys.float_info.max
remove_feature = None
categories = self.generalizations['categories']
category_counts = self._find_categories_count(original_data, categories)
category_counts = self._find_category_counts(original_data, categories)
for feature in ranges.keys():
if feature not in self._generalizations['untouched']:
if generalize_using_transform:
feature_ncp = self._calculate_ncp_for_feature_from_cells(feature, feature_data, original_data)
else:
feature_ncp = self._calc_ncp_numeric(ranges[feature],
range_counts[feature],
feature_data[feature],
@ -774,7 +995,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
new_cells = copy.deepcopy(self.cells)
cells_by_id = copy.deepcopy(self._cells_by_id)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
cells_by_id)
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
labels)) - current_accuracy
if accuracy_gain < 0:
@ -788,6 +1010,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
for feature in categories.keys():
if feature not in self.generalizations['untouched']:
if generalize_using_transform:
feature_ncp = self._calculate_ncp_for_feature_from_cells(feature, feature_data, original_data)
else:
feature_ncp = self._calc_ncp_categorical(categories[feature],
category_counts[feature],
feature_data[feature],
@ -797,7 +1022,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
new_cells = copy.deepcopy(self.cells)
cells_by_id = copy.deepcopy(self._cells_by_id)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
cells_by_id)
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
labels)) - current_accuracy
@ -812,31 +1038,119 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
print('feature to remove: ' + (str(remove_feature) if remove_feature is not None else 'none'))
return remove_feature
def _calculate_generalizations(self):
self._generalizations = {'ranges': GeneralizeToRepresentative._calculate_ranges(self.cells),
'categories': GeneralizeToRepresentative._calculate_categories(self.cells),
'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells)}
self._remove_categorical_untouched(self._generalizations)
def _calculate_ncp_for_feature_from_cells(self, feature, feature_data, samples_pd):
# count how many records are mapped to each cell
counted = np.zeros(samples_pd.shape[0]) # to mark records we already counted
total = samples_pd.shape[0]
feature_ncp = 0
for cell in self.cells:
count = self._get_record_count_for_cell(samples_pd, cell, counted)
generalizations = self._calculate_generalizations_for_cell(cell)
cell_ncp = 0
if feature in cell['ranges']:
cell_ncp = self._calc_ncp_numeric(generalizations['ranges'][feature],
[count],
feature_data[feature],
total)
elif feature in cell['categories']:
cell_ncp = self._calc_ncp_categorical(generalizations['categories'][feature],
[count],
feature_data[feature],
total)
feature_ncp += cell_ncp
return feature_ncp
def _find_range_count(self, samples, ranges):
samples_df = pd.DataFrame(samples, columns=self._encoded_features)
def _calculate_generalizations(self, samples: Optional[pd.DataFrame] = None):
ranges, range_representatives = self._calculate_ranges(self.cells)
categories, category_representatives = self._calculate_categories(self.cells)
self._generalizations = {'ranges': ranges,
'categories': categories,
'untouched': self._calculate_untouched(self.cells)}
self._remove_categorical_untouched(self._generalizations)
# compute representative value for each feature (based on data)
if samples is not None:
sample_indexes = self._map_to_ranges_categories(samples,
self._generalizations['ranges'],
self._generalizations['categories'])
# categorical - use most common value
old_category_representatives = category_representatives
category_representatives = {}
for feature in self._generalizations['categories']:
category_representatives[feature] = []
for g_index, group in enumerate(self._generalizations['categories'][feature]):
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == g_index]
if indexes:
rows = samples.iloc[indexes]
values = rows[feature]
category = Counter(values).most_common(1)[0][0]
category_representatives[feature].append(category)
else:
category_representatives[feature].append(old_category_representatives[feature][g_index])
# numerical - use actual value closest to mean
old_range_representatives = range_representatives
range_representatives = {}
for feature in self._generalizations['ranges']:
range_representatives[feature] = []
# find the mean value (per feature)
for index in range(len(self._generalizations['ranges'][feature])):
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == index]
if indexes:
rows = samples.iloc[indexes]
values = rows[feature]
median = np.median(values)
min_value = max(values)
min_dist = float("inf")
for value in values:
# euclidean distance between two floating point values
dist = abs(value - median)
if dist < min_dist:
min_dist = dist
min_value = value
range_representatives[feature].append(min_value)
else:
range_representatives[feature].append(old_range_representatives[feature][index])
self._generalizations['category_representatives'] = category_representatives
self._generalizations['range_representatives'] = range_representatives
def _calculate_generalizations_for_cell(self, cell):
ranges, range_representatives = self._calculate_ranges([cell])
categories, category_representatives = self._calculate_categories([cell])
generalizations = {'ranges': ranges,
'categories': categories,
'untouched': self._calculate_untouched([cell]),
'range_representatives': range_representatives,
'category_representatives': category_representatives}
self._remove_categorical_untouched(generalizations)
return generalizations
def _calculate_cell_generalizations(self):
# calculate generalizations separately per cell
cell_generalizations = {}
for cell in self.cells:
cell_generalizations[cell['id']] = self._calculate_generalizations_for_cell(cell)
return cell_generalizations
@staticmethod
def _find_range_counts(samples, ranges):
range_counts = {}
last_value = None
for r in ranges.keys():
range_counts[r] = []
# if empty list, all samples should be counted
if not ranges[r]:
range_counts[r].append(samples_df.shape[0])
range_counts[r].append(samples.shape[0])
else:
for value in ranges[r]:
counter = [item for item in samples_df[r] if int(item) <= value]
counter = [item for item in samples[r] if int(item) <= value]
range_counts[r].append(len(counter))
last_value = value
counter = [item for item in samples_df[r] if int(item) <= last_value]
counter = [item for item in samples[r] if int(item) > last_value]
range_counts[r].append(len(counter))
return range_counts
def _find_categories_count(self, samples, categories):
@staticmethod
def _find_category_counts(samples, categories):
category_counts = {}
for c in categories.keys():
category_counts[c] = []
@ -844,34 +1158,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
category_counts[c].append(len(samples.loc[samples[c].isin(value)]))
return category_counts
def _calculate_ncp(self, samples, generalizations, feature_data):
# supressed features are already taken care of within _calc_ncp_numeric
ranges = generalizations['ranges']
categories = generalizations['categories']
range_counts = self._find_range_count(samples, ranges)
category_counts = self._find_categories_count(samples, categories)
total = samples.shape[0]
total_ncp = 0
total_features = len(generalizations['untouched'])
for feature in ranges.keys():
feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature],
feature_data[feature], total)
total_ncp = total_ncp + feature_ncp
total_features += 1
for feature in categories.keys():
featureNCP = self._calc_ncp_categorical(categories[feature], category_counts[feature],
feature_data[feature],
total)
total_ncp = total_ncp + featureNCP
total_features += 1
if total_features == 0:
return 0
return total_ncp / total_features
@staticmethod
def _calculate_ranges(cells):
ranges = {}
range_representatives = {}
for cell in cells:
for feature in [key for key in cell['ranges'].keys() if
'untouched' not in cell or key not in cell['untouched']]:
@ -881,17 +1171,37 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
ranges[feature].append(cell['ranges'][feature]['start'])
if cell['ranges'][feature]['end'] is not None:
ranges[feature].append(cell['ranges'][feature]['end'])
# default representative values (computed with no data)
for feature in ranges.keys():
range_representatives[feature] = []
if not ranges[feature]:
# no values means the complete range. Without data we cannot know what to put here.
# Using 0 as a placeholder.
range_representatives[feature].append(0)
else:
ranges[feature] = list(set(ranges[feature]))
ranges[feature].sort()
return ranges
prev_value = 0
for index, value in enumerate(ranges[feature]):
if index == 0:
# for first range, use min value
range_representatives[feature].append(value)
else:
# use middle of range (this will be a float)
range_representatives[feature].append((value - prev_value) / 2)
prev_value = value
# for last range use max value + 1
range_representatives[feature].append(prev_value + 1)
return ranges, range_representatives
@staticmethod
def _calculate_categories(cells):
categories = {}
category_representatives = {}
categorical_features_values = GeneralizeToRepresentative._calculate_categorical_features_values(cells)
for feature in categorical_features_values.keys():
partitions = []
category_representatives[feature] = []
values = categorical_features_values[feature]
assigned = []
for i in range(len(values)):
@ -908,8 +1218,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
partition.append(value2)
assigned.append(value2)
partitions.append(partition)
# default representative values (computed with no data)
category_representatives[feature].append(partition[0]) # random
categories[feature] = partitions
return categories
return categories, category_representatives
@staticmethod
def _calculate_categorical_features_values(cells):
@ -942,26 +1254,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
untouched = untouched.intersection(*untouched_lists)
return list(untouched)
@staticmethod
def _calc_ncp_categorical(categories, categoryCount, feature_data, total):
category_sizes = [len(g) if len(g) > 1 else 0 for g in categories]
normalized_category_sizes = [s * n / total for s, n in zip(category_sizes, categoryCount)]
average_group_size = sum(normalized_category_sizes) / len(normalized_category_sizes)
return average_group_size / feature_data['range'] # number of values in category
@staticmethod
def _calc_ncp_numeric(feature_range, range_count, feature_data, total):
# if there are no ranges, feature is supressed and iLoss is 1
if not feature_range:
return 1
# range only contains the split values, need to add min and max value of feature
# to enable computing sizes of all ranges
new_range = [feature_data['min']] + feature_range + [feature_data['max']]
range_sizes = [b - a for a, b in zip(new_range[::1], new_range[1::1])]
normalized_range_sizes = [s * n / total for s, n in zip(range_sizes, range_count)]
average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes)
return average_range_size / (feature_data['max'] - feature_data['min'])
@staticmethod
def _remove_feature_from_cells(cells, cells_by_id, feature):
for cell in cells:

View file

@ -15,11 +15,12 @@ import pandas as pd
import logging
import torch
from torch import Tensor
from scipy.sparse import csr_matrix
logger = logging.getLogger(__name__)
INPUT_DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame, List, Tensor]
INPUT_DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame, List, Tensor, csr_matrix]
OUTPUT_DATA_ARRAY_TYPE = np.ndarray
DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame]
@ -29,14 +30,16 @@ def array2numpy(arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
"""
converts from INPUT_DATA_ARRAY_TYPE to numpy array
"""
if type(arr) == np.ndarray:
if isinstance(arr, np.ndarray):
return arr
if type(arr) == pd.DataFrame or type(arr) == pd.Series:
if isinstance(arr, pd.DataFrame) or isinstance(arr, pd.Series):
return arr.to_numpy()
if isinstance(arr, list):
return np.array(arr)
if type(arr) == Tensor:
if isinstance(arr, Tensor):
return arr.detach().cpu().numpy()
if isinstance(arr, csr_matrix):
return arr.toarray()
raise ValueError("Non supported type: ", type(arr).__name__)
@ -45,14 +48,16 @@ def array2torch_tensor(arr: INPUT_DATA_ARRAY_TYPE) -> Tensor:
"""
converts from INPUT_DATA_ARRAY_TYPE to torch tensor array
"""
if type(arr) == np.ndarray:
if isinstance(arr, np.ndarray):
return torch.from_numpy(arr)
if type(arr) == pd.DataFrame or type(arr) == pd.Series:
if isinstance(arr, pd.DataFrame) or isinstance(arr, pd.Series):
return torch.from_numpy(arr.to_numpy())
if isinstance(arr, list):
return torch.tensor(arr)
if type(arr) == Tensor:
if isinstance(arr, Tensor):
return arr
if isinstance(arr, csr_matrix):
return torch.from_numpy(arr.toarray())
raise ValueError("Non supported type: ", type(arr).__name__)
@ -217,7 +222,7 @@ class ArrayDataset(Dataset):
def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
features_names: Optional[list] = None, **kwargs):
self.is_pandas = self.is_pandas = type(x) == pd.DataFrame or type(x) == pd.Series
self.is_pandas = self.is_pandas = isinstance(x, pd.DataFrame) or isinstance(x, pd.Series)
self.features_names = features_names
self._y = array2numpy(y) if y is not None else None
@ -325,7 +330,7 @@ class PytorchData(Dataset):
self._y = array2torch_tensor(y) if y is not None else None
self._x = array2torch_tensor(x)
self.is_pandas = type(x) == pd.DataFrame or type(x) == pd.Series
self.is_pandas = isinstance(x, pd.DataFrame) or isinstance(x, pd.Series)
if self.is_pandas:
self.features_names = x.columns

View file

@ -43,7 +43,7 @@ def get_nb_classes(y: OUTPUT_DATA_ARRAY_TYPE) -> int:
if y is None:
return 0
if type(y) != np.ndarray:
if not isinstance(y, np.ndarray):
raise ValueError("Input should be numpy array")
if is_one_hot(y):
@ -339,8 +339,8 @@ class BlackboxClassifierPredictions(BlackboxClassifier):
y_test_pred = check_and_transform_label_format(y_test_pred, nb_classes=self._nb_classes)
if x_train_pred is not None and y_train_pred is not None and x_test_pred is not None and y_test_pred is not None:
if type(y_train_pred) != np.ndarray or type(y_test_pred) != np.ndarray \
or type(y_train_pred) != np.ndarray or type(y_test_pred) != np.ndarray:
if not isinstance(y_train_pred, np.ndarray) or not isinstance(y_test_pred, np.ndarray) \
or not isinstance(y_train_pred, np.ndarray) or not isinstance(y_test_pred, np.ndarray):
raise NotImplementedError("X/Y Data should be numpy array")
x_pred = np.vstack((x_train_pred, x_test_pred))
y_pred = np.vstack((y_train_pred, y_test_pred))

View file

@ -46,7 +46,7 @@ class SklearnClassifier(SklearnModel):
def __init__(self, model: BaseEstimator, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
unlimited_queries: Optional[bool] = True, **kwargs):
super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs)
self._art_model = ArtSklearnClassifier(model)
self._art_model = ArtSklearnClassifier(model, preprocessing=None)
def fit(self, train_data: Dataset, **kwargs) -> None:
"""

View file

@ -1,6 +1,6 @@
numpy==1.22.0
pandas~=1.1.0
scipy==1.4.1
numpy==1.24.2
pandas==1.1.05
scipy==1.10.1
scikit-learn>=0.22.2,<=1.1.3
torch>=1.8.0
tqdm>=4.64.1

View file

@ -18,7 +18,7 @@ MIN_SHARE = 0.5
MIN_ROC_AUC = 0.0
MIN_PRECISION = 0.0
NUM_SYNTH_SAMPLES = 40000
NUM_SYNTH_SAMPLES = 400
NUM_SYNTH_COMPONENTS = 4
iris_dataset_np = get_iris_dataset_np()
@ -109,8 +109,8 @@ def kde(n_samples, n_components, original_data):
digit_data = original_data
pca = PCA(n_components=n_components, whiten=False)
data = pca.fit_transform(digit_data)
params = {'bandwidth': np.logspace(-1, 1, 20)}
grid = GridSearchCV(KernelDensity(), params, cv=5)
params = {'bandwidth': np.logspace(-1, 1, 10)}
grid = GridSearchCV(KernelDensity(), params, cv=2)
grid.fit(data)
kde_estimator = grid.best_estimator_

File diff suppressed because it is too large Load diff