mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-04-24 20:36:21 +02:00
Make data minimization more consistent and performant (#83)
* Update requirements * Update incompatible scipy version * Reduce runtime of dataset assessment tests * NCP is now a class that contains 3 values: fit_score, transform_score and generalizations_score, so that all calculated NCP scores are stored regardless of the order in which the different methods are called. Generalizations can now be applied either from tree cells or from the global generalizations struct, depending on the value of generalize_using_transform. Representative values can also be computed from global generalizations. Removing a feature from the generalization can also be applied in either mode. * Compute generalizations with test data when possible (for computing better representatives). * Externalize common test code to methods.
This commit is contained in:
parent
e9a225501f
commit
13a0567183
8 changed files with 1004 additions and 689 deletions
|
|
@ -1,6 +1,5 @@
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from scipy.spatial import distance
|
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
|
||||||
from sklearn.compose import ColumnTransformer
|
from sklearn.compose import ColumnTransformer
|
||||||
|
|
@ -146,7 +145,8 @@ class Anonymize:
|
||||||
min_value = max(values)
|
min_value = max(values)
|
||||||
min_dist = float("inf")
|
min_dist = float("inf")
|
||||||
for value in values:
|
for value in values:
|
||||||
dist = distance.euclidean(value, median)
|
# euclidean distance between two floating point values
|
||||||
|
dist = abs(value - median)
|
||||||
if dist < min_dist:
|
if dist < min_dist:
|
||||||
min_dist = dist
|
min_dist = dist
|
||||||
min_value = value
|
min_value = value
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,8 @@
|
||||||
This module implements all classes needed to perform data minimization
|
This module implements all classes needed to perform data minimization
|
||||||
"""
|
"""
|
||||||
from typing import Union, Optional
|
from typing import Union, Optional
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from collections import Counter
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import copy
|
import copy
|
||||||
|
|
@ -20,6 +22,13 @@ from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
|
||||||
from apt.utils.models import Model, SklearnRegressor, ModelOutputType, SklearnClassifier
|
from apt.utils.models import Model, SklearnRegressor, ModelOutputType, SklearnClassifier
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class NCPScores:
|
||||||
|
fit_score: float = None
|
||||||
|
transform_score: float = None
|
||||||
|
generalizations_score: float = None
|
||||||
|
|
||||||
|
|
||||||
class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
|
class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
|
||||||
"""
|
"""
|
||||||
A transformer that generalizes data to representative points.
|
A transformer that generalizes data to representative points.
|
||||||
|
|
@ -59,14 +68,23 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
:param is_regression: Whether the model is a regression model or not (if False, assumes a classification model).
|
:param is_regression: Whether the model is a regression model or not (if False, assumes a classification model).
|
||||||
Default is False.
|
Default is False.
|
||||||
:type is_regression: boolean, optional
|
:type is_regression: boolean, optional
|
||||||
|
:param generalize_using_transform: Indicates how to calculate NCP and accuracy during the generalization
|
||||||
|
process. True means that the `transform` method is used to transform original
|
||||||
|
data into generalized data that is used for accuracy and NCP calculation.
|
||||||
|
False indicates that the `generalizations` structure should be used.
|
||||||
|
Default is True.
|
||||||
|
:type generalize_using_transform: boolean, optional
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: Optional[float] = 0.998,
|
def __init__(self, estimator: Union[BaseEstimator, Model] = None,
|
||||||
cells: Optional[list] = None, categorical_features: Optional[Union[np.ndarray, list]] = None,
|
target_accuracy: Optional[float] = 0.998,
|
||||||
|
cells: Optional[list] = None,
|
||||||
|
categorical_features: Optional[Union[np.ndarray, list]] = None,
|
||||||
encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None,
|
encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None,
|
||||||
features_to_minimize: Optional[Union[np.ndarray, list]] = None,
|
features_to_minimize: Optional[Union[np.ndarray, list]] = None,
|
||||||
train_only_features_to_minimize: Optional[bool] = True,
|
train_only_features_to_minimize: Optional[bool] = True,
|
||||||
is_regression: Optional[bool] = False):
|
is_regression: Optional[bool] = False,
|
||||||
|
generalize_using_transform: bool = True):
|
||||||
|
|
||||||
self.estimator = estimator
|
self.estimator = estimator
|
||||||
if estimator is not None and not issubclass(estimator.__class__, Model):
|
if estimator is not None and not issubclass(estimator.__class__, Model):
|
||||||
|
|
@ -76,6 +94,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_PROBABILITIES)
|
self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_PROBABILITIES)
|
||||||
self.target_accuracy = target_accuracy
|
self.target_accuracy = target_accuracy
|
||||||
self.cells = cells
|
self.cells = cells
|
||||||
|
if cells:
|
||||||
|
self._calculate_generalizations()
|
||||||
self.categorical_features = []
|
self.categorical_features = []
|
||||||
if categorical_features:
|
if categorical_features:
|
||||||
self.categorical_features = categorical_features
|
self.categorical_features = categorical_features
|
||||||
|
|
@ -83,6 +103,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
self.train_only_features_to_minimize = train_only_features_to_minimize
|
self.train_only_features_to_minimize = train_only_features_to_minimize
|
||||||
self.is_regression = is_regression
|
self.is_regression = is_regression
|
||||||
self.encoder = encoder
|
self.encoder = encoder
|
||||||
|
self.generalize_using_transform = generalize_using_transform
|
||||||
|
self._ncp_scores = NCPScores()
|
||||||
|
self._feature_data = None
|
||||||
|
self._categorical_values = {}
|
||||||
|
self._dt = None
|
||||||
|
self._features = None
|
||||||
|
self._level = 0
|
||||||
|
|
||||||
def get_params(self, deep=True):
|
def get_params(self, deep=True):
|
||||||
"""
|
"""
|
||||||
|
|
@ -99,12 +126,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
ret['features_to_minimize'] = self.features_to_minimize
|
ret['features_to_minimize'] = self.features_to_minimize
|
||||||
ret['train_only_features_to_minimize'] = self.train_only_features_to_minimize
|
ret['train_only_features_to_minimize'] = self.train_only_features_to_minimize
|
||||||
ret['is_regression'] = self.is_regression
|
ret['is_regression'] = self.is_regression
|
||||||
|
ret['estimator'] = self.estimator
|
||||||
|
ret['encoder'] = self.encoder
|
||||||
if deep:
|
if deep:
|
||||||
ret['cells'] = copy.deepcopy(self.cells)
|
ret['cells'] = copy.deepcopy(self.cells)
|
||||||
ret['estimator'] = self.estimator
|
|
||||||
ret['encoder'] = self.encoder
|
|
||||||
else:
|
else:
|
||||||
ret['cells'] = copy.copy(self.cells)
|
ret['cells'] = copy.copy(self.cells)
|
||||||
|
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
def set_params(self, **params):
|
def set_params(self, **params):
|
||||||
|
|
@ -132,6 +160,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
self.is_regression = params['is_regression']
|
self.is_regression = params['is_regression']
|
||||||
if 'cells' in params:
|
if 'cells' in params:
|
||||||
self.cells = params['cells']
|
self.cells = params['cells']
|
||||||
|
if 'estimator' in params:
|
||||||
|
self.estimator = params['estimator']
|
||||||
|
if 'encoder' in params:
|
||||||
|
self.encoder = params['encoder']
|
||||||
return self
|
return self
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
@ -140,24 +172,27 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
Return the generalizations derived from the model and test data.
|
Return the generalizations derived from the model and test data.
|
||||||
|
|
||||||
:return: generalizations object. Contains 3 sections: 'ranges' that contains ranges for numerical features,
|
:return: generalizations object. Contains 3 sections: 'ranges' that contains ranges for numerical features,
|
||||||
'categories' that contains sub-groups of categories for categorical features, and
|
'categories' that contains sub-groups of categories for categorical features, and
|
||||||
'untouched' that contains the features that could not be generalized.
|
'untouched' that contains the features that could not be generalized.
|
||||||
"""
|
"""
|
||||||
return self._generalizations
|
return self._generalizations
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def ncp(self):
|
def ncp(self):
|
||||||
"""
|
"""
|
||||||
Return the NCP score of the generalizations.
|
Return the last calculated NCP scores. NCP score is calculated upon calling `fit` (on the training data),
|
||||||
|
`transform` (on the test data) or when explicitly calling `calculate_ncp` and providing it a dataset.
|
||||||
|
|
||||||
:return: ncp score as float.
|
:return: NCPScores object that contains a score corresponding to the last fit call, one for the last
|
||||||
|
transform call, and a score based on global generalizations.
|
||||||
"""
|
"""
|
||||||
return self._ncp
|
return self._ncp_scores
|
||||||
|
|
||||||
def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
|
def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
|
||||||
features_names: Optional[list] = None, dataset: Optional[ArrayDataset] = None):
|
features_names: Optional[list] = None, dataset: Optional[ArrayDataset] = None):
|
||||||
"""
|
"""
|
||||||
Learns the generalizations based on training data, and applies them to the data.
|
Learns the generalizations based on training data, and applies them to the data. Also sets the fit_score,
|
||||||
|
transform_score and generalizations_score in self.ncp.
|
||||||
|
|
||||||
:param X: The training input samples.
|
:param X: The training input samples.
|
||||||
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
|
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
|
||||||
|
|
@ -172,19 +207,23 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
|
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
|
||||||
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
|
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
|
||||||
"""
|
"""
|
||||||
|
if not self.generalize_using_transform:
|
||||||
|
raise ValueError('fit_transform method called even though generalize_using_transform parameter was False. '
|
||||||
|
'This can lead to inconsistent results.')
|
||||||
self.fit(X, y, features_names, dataset=dataset)
|
self.fit(X, y, features_names, dataset=dataset)
|
||||||
return self.transform(X, features_names, dataset=dataset)
|
return self.transform(X, features_names, dataset=dataset)
|
||||||
|
|
||||||
def fit(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
|
def fit(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
|
||||||
features_names: Optional = None, dataset: ArrayDataset = None):
|
features_names: Optional = None, dataset: ArrayDataset = None):
|
||||||
"""Learns the generalizations based on training data.
|
"""Learns the generalizations based on training data. Also sets the fit_score and generalizations_score in
|
||||||
|
self.ncp.
|
||||||
|
|
||||||
:param X: The training input samples.
|
:param X: The training input samples.
|
||||||
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
|
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
|
||||||
:param y: The target values. This should contain the predictions of the original model on ``X``.
|
:param y: The target values. This should contain the predictions of the original model on ``X``.
|
||||||
:type y: array-like, shape (n_samples,), optional
|
:type y: array-like, shape (n_samples,), optional
|
||||||
:param features_names: The feature names, in the order that they appear in the data. Can be provided when
|
:param features_names: The feature names, in the order that they appear in the data. Should be provided when
|
||||||
passing the data as ``X`` and ``y``
|
passing the data via ``X`` as a numpy array
|
||||||
:type features_names: list of strings, optional
|
:type features_names: list of strings, optional
|
||||||
:param dataset: Data wrapper containing the training input samples and the predictions of the original model
|
:param dataset: Data wrapper containing the training input samples and the predictions of the original model
|
||||||
on the training data. Either ``X``, ``y`` OR ``dataset`` need to be provided, not both.
|
on the training data. Either ``X``, ``y`` OR ``dataset`` need to be provided, not both.
|
||||||
|
|
@ -223,46 +262,35 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
self.features_to_minimize = [str(i) for i in self.features_to_minimize]
|
self.features_to_minimize = [str(i) for i in self.features_to_minimize]
|
||||||
if not all(elem in self._features for elem in self.features_to_minimize):
|
if not all(elem in self._features for elem in self.features_to_minimize):
|
||||||
raise ValueError('features to minimize should be a subset of features names')
|
raise ValueError('features to minimize should be a subset of features names')
|
||||||
x_QI = x.loc[:, self.features_to_minimize]
|
x_qi = x.loc[:, self.features_to_minimize]
|
||||||
|
|
||||||
# divide dataset into train and test
|
# divide dataset into train and test
|
||||||
used_data = x
|
used_data = x
|
||||||
if self.train_only_features_to_minimize:
|
if self.train_only_features_to_minimize:
|
||||||
used_data = x_QI
|
used_data = x_qi
|
||||||
if self.is_regression:
|
if self.is_regression:
|
||||||
X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4,
|
x_train, x_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4,
|
||||||
random_state=14)
|
random_state=14)
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(),
|
x_train, x_test, y_train, y_test = train_test_split(x, dataset.get_labels(),
|
||||||
stratify=dataset.get_labels(), test_size=0.4,
|
stratify=dataset.get_labels(), test_size=0.4,
|
||||||
random_state=18)
|
random_state=18)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
print('Could not stratify split due to uncommon class value, doing unstratified split instead')
|
print('Could not stratify split due to uncommon class value, doing unstratified split instead')
|
||||||
X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4,
|
x_train, x_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4,
|
||||||
random_state=18)
|
random_state=18)
|
||||||
|
|
||||||
X_train_QI = X_train.loc[:, self.features_to_minimize]
|
x_train_qi = x_train.loc[:, self.features_to_minimize]
|
||||||
X_test_QI = X_test.loc[:, self.features_to_minimize]
|
x_test_qi = x_test.loc[:, self.features_to_minimize]
|
||||||
used_X_train = X_train
|
used_x_train = x_train
|
||||||
used_X_test = X_test
|
used_x_test = x_test
|
||||||
if self.train_only_features_to_minimize:
|
if self.train_only_features_to_minimize:
|
||||||
used_X_train = X_train_QI
|
used_x_train = x_train_qi
|
||||||
used_X_test = X_test_QI
|
used_x_test = x_test_qi
|
||||||
|
|
||||||
# collect feature data (such as min, max)
|
# collect feature data (such as min, max)
|
||||||
feature_data = {}
|
self._feature_data = self._get_feature_data(x)
|
||||||
for feature in self._features:
|
|
||||||
if feature not in feature_data.keys():
|
|
||||||
fd = {}
|
|
||||||
values = list(x.loc[:, feature])
|
|
||||||
if feature not in self.categorical_features:
|
|
||||||
fd['min'] = min(values)
|
|
||||||
fd['max'] = max(values)
|
|
||||||
fd['range'] = max(values) - min(values)
|
|
||||||
else:
|
|
||||||
fd['range'] = len(np.unique(values))
|
|
||||||
feature_data[feature] = fd
|
|
||||||
|
|
||||||
# default encoder in case none provided
|
# default encoder in case none provided
|
||||||
if self.encoder is None:
|
if self.encoder is None:
|
||||||
|
|
@ -290,9 +318,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
|
|
||||||
# prepare data for DT
|
# prepare data for DT
|
||||||
self._encode_categorical_features(used_data, save_mapping=True)
|
self._encode_categorical_features(used_data, save_mapping=True)
|
||||||
x_prepared = self._encode_categorical_features(used_X_train)
|
x_prepared = self._encode_categorical_features(used_x_train)
|
||||||
self._dt.fit(x_prepared, y_train)
|
self._dt.fit(x_prepared, y_train)
|
||||||
x_prepared_test = self._encode_categorical_features(used_X_test)
|
x_prepared_test = self._encode_categorical_features(used_x_test)
|
||||||
|
|
||||||
self._calculate_cells()
|
self._calculate_cells()
|
||||||
self._modify_cells()
|
self._modify_cells()
|
||||||
|
|
@ -302,11 +330,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
self._remove_feature_from_cells(self.cells, self._cells_by_id, feature)
|
self._remove_feature_from_cells(self.cells, self._cells_by_id, feature)
|
||||||
|
|
||||||
nodes = self._get_nodes_level(0)
|
nodes = self._get_nodes_level(0)
|
||||||
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
|
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
|
||||||
|
|
||||||
# self._cells currently holds the generalization created from the tree leaves
|
# self._cells currently holds the generalization created from the tree leaves
|
||||||
self._calculate_generalizations()
|
self._calculate_generalizations(x_test)
|
||||||
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
|
if self.generalize_using_transform:
|
||||||
|
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
|
||||||
|
else:
|
||||||
|
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
|
||||||
|
|
||||||
# check accuracy
|
# check accuracy
|
||||||
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
|
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
|
||||||
|
|
@ -316,66 +347,79 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
# if accuracy above threshold, improve generalization
|
# if accuracy above threshold, improve generalization
|
||||||
if accuracy > self.target_accuracy:
|
if accuracy > self.target_accuracy:
|
||||||
print('Improving generalizations')
|
print('Improving generalizations')
|
||||||
level = 1
|
self._level = 1
|
||||||
while accuracy > self.target_accuracy:
|
while accuracy > self.target_accuracy:
|
||||||
cells_previous_iter = self.cells
|
cells_previous_iter = self.cells
|
||||||
generalization_prev_iter = self._generalizations
|
generalization_prev_iter = self._generalizations
|
||||||
cells_by_id_prev = self._cells_by_id
|
cells_by_id_prev = self._cells_by_id
|
||||||
nodes = self._get_nodes_level(level)
|
nodes = self._get_nodes_level(self._level)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self._calculate_level_cells(level)
|
self._calculate_level_cells(self._level)
|
||||||
except TypeError as e:
|
except TypeError as e:
|
||||||
print(e)
|
print(e)
|
||||||
|
self._level -= 1
|
||||||
break
|
break
|
||||||
|
|
||||||
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
|
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
|
||||||
|
|
||||||
|
self._calculate_generalizations(x_test)
|
||||||
|
if self.generalize_using_transform:
|
||||||
|
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
|
||||||
|
self._cells_by_id)
|
||||||
|
else:
|
||||||
|
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
|
||||||
|
|
||||||
self._calculate_generalizations()
|
|
||||||
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells,
|
|
||||||
self._cells_by_id)
|
|
||||||
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
|
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
|
||||||
# if accuracy passed threshold roll back to previous iteration generalizations
|
# if accuracy passed threshold roll back to previous iteration generalizations
|
||||||
if accuracy < self.target_accuracy:
|
if accuracy < self.target_accuracy:
|
||||||
self.cells = cells_previous_iter
|
self.cells = cells_previous_iter
|
||||||
self._generalizations = generalization_prev_iter
|
self._generalizations = generalization_prev_iter
|
||||||
self._cells_by_id = cells_by_id_prev
|
self._cells_by_id = cells_by_id_prev
|
||||||
|
self._level -= 1
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
print('Pruned tree to level: %d, new relative accuracy: %f' % (level, accuracy))
|
print('Pruned tree to level: %d, new relative accuracy: %f' % (self._level, accuracy))
|
||||||
level += 1
|
self._level += 1
|
||||||
|
|
||||||
# if accuracy below threshold, improve accuracy by removing features from generalization
|
# if accuracy below threshold, improve accuracy by removing features from generalization
|
||||||
elif accuracy < self.target_accuracy:
|
elif accuracy < self.target_accuracy:
|
||||||
print('Improving accuracy')
|
print('Improving accuracy')
|
||||||
while accuracy < self.target_accuracy:
|
while accuracy < self.target_accuracy:
|
||||||
removed_feature = self._remove_feature_from_generalization(X_test, x_prepared_test,
|
removed_feature = self._remove_feature_from_generalization(x_test, x_prepared_test,
|
||||||
nodes, y_test,
|
nodes, y_test,
|
||||||
feature_data, accuracy)
|
self._feature_data, accuracy,
|
||||||
|
self.generalize_using_transform)
|
||||||
if removed_feature is None:
|
if removed_feature is None:
|
||||||
break
|
break
|
||||||
|
|
||||||
self._calculate_generalizations()
|
self._calculate_generalizations(x_test)
|
||||||
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
|
if self.generalize_using_transform:
|
||||||
|
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
|
||||||
|
self._cells_by_id)
|
||||||
|
else:
|
||||||
|
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
|
||||||
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
|
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
|
||||||
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
|
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
|
||||||
|
|
||||||
# self._cells currently holds the chosen generalization based on target accuracy
|
# self._cells currently holds the chosen generalization based on target accuracy
|
||||||
|
|
||||||
# calculate iLoss
|
# calculate iLoss
|
||||||
self._ncp = self._calculate_ncp(X_test, self._generalizations, feature_data)
|
x_test_dataset = ArrayDataset(x_test, features_names=self._features)
|
||||||
|
self._ncp_scores.fit_score = self.calculate_ncp(x_test_dataset)
|
||||||
|
self._ncp_scores.generalizations_score = self.calculate_ncp(x_test_dataset)
|
||||||
|
|
||||||
# Return the transformer
|
# Return the transformer
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None,
|
def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None,
|
||||||
dataset: Optional[ArrayDataset] = None):
|
dataset: Optional[ArrayDataset] = None):
|
||||||
""" Transforms data records to representative points.
|
""" Transforms data records to representative points. Also sets the transform_score in self.ncp.
|
||||||
|
|
||||||
:param X: The training input samples.
|
:param X: The training input samples.
|
||||||
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
|
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
|
||||||
:param features_names: The feature names, in the order that they appear in the data. Can be provided when
|
:param features_names: The feature names, in the order that they appear in the data. Should be provided when
|
||||||
passing the data as ``X`` and ``y``
|
passing the data via ``X`` as a numpy array
|
||||||
:type features_names: list of strings, optional
|
:type features_names: list of strings, optional
|
||||||
:param dataset: Data wrapper containing the training input samples and the predictions of the original model
|
:param dataset: Data wrapper containing the training input samples and the predictions of the original model
|
||||||
on the training data. Either ``X`` OR ``dataset`` need to be provided, not both.
|
on the training data. Either ``X`` OR ``dataset`` need to be provided, not both.
|
||||||
|
|
@ -383,69 +427,197 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
|
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
|
||||||
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
|
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
|
||||||
"""
|
"""
|
||||||
|
if not self.generalize_using_transform:
|
||||||
|
raise ValueError('transform method called even though generalize_using_transform parameter was False. This '
|
||||||
|
'can lead to inconsistent results.')
|
||||||
|
transformed = self._inner_transform(X, features_names, dataset)
|
||||||
|
transformed_dataset = ArrayDataset(transformed, features_names=self._features)
|
||||||
|
self._ncp_scores.transform_score = self.calculate_ncp(transformed_dataset)
|
||||||
|
return transformed
|
||||||
|
|
||||||
|
def calculate_ncp(self, samples: ArrayDataset):
|
||||||
|
"""
|
||||||
|
Compute the NCP score of the generalization. Calculation is based on the value of the
|
||||||
|
generalize_using_transform param. If samples are provided, updates stored ncp value to the one computed on the
|
||||||
|
provided data. If samples are not provided, returns the last NCP score computed by the `fit` or `transform` method.
|
||||||
|
|
||||||
|
Based on the NCP score presented in: Ghinita, G., Karras, P., Kalnis, P., Mamoulis, N.: Fast data anonymization
|
||||||
|
with low information loss (https://www.vldb.org/conf/2007/papers/research/p758-ghinita.pdf)
|
||||||
|
|
||||||
|
:param samples: The input samples to compute the NCP score on.
|
||||||
|
:type samples: ArrayDataset, optional. ``features_names`` should be set.
|
||||||
|
:return: NCP score as float.
|
||||||
|
"""
|
||||||
|
if not samples.features_names:
|
||||||
|
raise ValueError('features_names should be set in input ArrayDataset.')
|
||||||
|
samples_pd = pd.DataFrame(samples.get_samples(), columns=samples.features_names)
|
||||||
|
if self._features is None:
|
||||||
|
self._features = samples.features_names
|
||||||
|
if self._feature_data is None:
|
||||||
|
self._feature_data = self._get_feature_data(samples_pd)
|
||||||
|
total_samples = samples_pd.shape[0]
|
||||||
|
|
||||||
|
if self.generalize_using_transform:
|
||||||
|
generalizations = self._calculate_cell_generalizations()
|
||||||
|
# count how many records are mapped to each cell
|
||||||
|
counted = np.zeros(samples_pd.shape[0]) # to mark records we already counted
|
||||||
|
ncp = 0
|
||||||
|
for cell in self.cells:
|
||||||
|
count = self._get_record_count_for_cell(samples_pd, cell, counted)
|
||||||
|
range_counts = {}
|
||||||
|
category_counts = {}
|
||||||
|
for feature in cell['ranges']:
|
||||||
|
range_counts[feature] = [count]
|
||||||
|
for feature in cell['categories']:
|
||||||
|
category_counts[feature] = [count]
|
||||||
|
ncp += self._calc_ncp_for_generalization(generalizations[cell['id']], range_counts, category_counts,
|
||||||
|
total_samples)
|
||||||
|
else: # use generalizations
|
||||||
|
generalizations = self.generalizations
|
||||||
|
range_counts = self._find_range_counts(samples_pd, generalizations['ranges'])
|
||||||
|
category_counts = self._find_category_counts(samples_pd, generalizations['categories'])
|
||||||
|
ncp = self._calc_ncp_for_generalization(generalizations, range_counts, category_counts, total_samples)
|
||||||
|
|
||||||
|
return ncp
|
||||||
|
|
||||||
|
def _inner_transform(self, x: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None,
                     dataset: Optional[ArrayDataset] = None):
    """
    Generalize the supplied records using the fitted cells.

    Exactly one of ``x`` / ``dataset`` must be given. When a fitted tree is available (``self._dt``)
    records are mapped to cells through the tree; otherwise every cell is scanned for the records
    it contains.

    :param x: The samples to generalize (wrapped in an ArrayDataset together with features_names).
    :param features_names: Feature names used when ``x`` is supplied.
    :param dataset: The samples to generalize, already wrapped in an ArrayDataset.
    :return: The generalized data. A DataFrame when the dataset reports ``is_pandas`` or ``x`` is a
             DataFrame, otherwise the result of ``to_numpy()``.
    """
    # Check if fit has been called
    not_fitted_msg = ('This %(name)s instance is not initialized yet. '
                      'Call ‘fit’ or ‘set_params’ with '
                      'appropriate arguments before using this method.')
    check_is_fitted(self, ['cells'], msg=not_fitted_msg)

    # the same message is raised both when the two inputs are given and when none is
    if x is None:
        if dataset is None:
            raise ValueError('Either x OR dataset need to be provided, not both')
    elif dataset is not None:
        raise ValueError('Either x OR dataset need to be provided, not both')
    else:
        dataset = ArrayDataset(x, features_names=features_names)

    # feature names seen earlier take precedence over the ones carried by this dataset
    if dataset and dataset.features_names and self._features is None:
        self._features = dataset.features_names
    if dataset and dataset.get_samples() is not None:
        x_pd = pd.DataFrame(dataset.get_samples(), columns=self._features)

    n_columns = x_pd.shape[1]
    if n_columns != self._n_features and self._n_features != 0:
        raise ValueError('Shape of input is different from what was seen'
                         'in `fit`')

    if not self._features:
        self._features = list(range(n_columns))

    if self._dt:  # only works if fit was called previously (but much more efficient)
        level_nodes = self._get_nodes_level(self._level)
        quasi_identifiers = x_pd.loc[:, self.features_to_minimize]
        tree_input = quasi_identifiers if self.train_only_features_to_minimize else x_pd
        encoded = self._encode_categorical_features(tree_input)
        generalized = self._generalize_from_tree(x_pd, encoded, level_nodes, self.cells, self._cells_by_id)
    else:
        mapped = np.zeros(x_pd.shape[0])  # to mark records we already mapped
        indexes_per_cell = [self._get_record_indexes_for_cell(x_pd, cell, mapped) for cell in self.cells]
        generalized = self._generalize_indexes(x_pd, self.cells, indexes_per_cell)

    if (dataset and dataset.is_pandas) or isinstance(x, pd.DataFrame):
        return generalized
    return generalized.to_numpy()
|
||||||
|
|
||||||
def _get_record_indexes_for_cell(self, X, cell, mapped):
|
def _calc_ncp_for_generalization(self, generalization, range_counts, category_counts, total_count):
|
||||||
|
total_ncp = 0
|
||||||
|
total_features = len(generalization['untouched'])
|
||||||
|
ranges = generalization['ranges']
|
||||||
|
categories = generalization['categories']
|
||||||
|
|
||||||
|
# suppressed features are already taken care of within _calc_ncp_numeric
|
||||||
|
for feature in ranges.keys():
|
||||||
|
feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature],
|
||||||
|
self._feature_data[feature], total_count)
|
||||||
|
total_ncp = total_ncp + feature_ncp
|
||||||
|
total_features += 1
|
||||||
|
for feature in categories.keys():
|
||||||
|
feature_ncp = self._calc_ncp_categorical(categories[feature], category_counts[feature],
|
||||||
|
self._feature_data[feature],
|
||||||
|
total_count)
|
||||||
|
total_ncp = total_ncp + feature_ncp
|
||||||
|
total_features += 1
|
||||||
|
if total_features == 0:
|
||||||
|
return 0
|
||||||
|
return total_ncp / total_features
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _calc_ncp_categorical(categories, category_count, feature_data, total):
|
||||||
|
category_sizes = [len(g) if len(g) > 1 else 0 for g in categories]
|
||||||
|
normalized_category_sizes = [s * n / total for s, n in zip(category_sizes, category_count)]
|
||||||
|
average_group_size = sum(normalized_category_sizes) / len(normalized_category_sizes)
|
||||||
|
return average_group_size / feature_data['range'] # number of values in category
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _calc_ncp_numeric(range, range_count, feature_data, total):
|
||||||
|
# if there are no ranges, feature is suppressed and iLoss is 1
|
||||||
|
if not range:
|
||||||
|
return 1
|
||||||
|
# range only contains the split values, need to add min and max value of feature
|
||||||
|
# to enable computing sizes of all ranges
|
||||||
|
new_range = [feature_data['min']] + range + [feature_data['max']]
|
||||||
|
range_sizes = [b - a for a, b in zip(new_range[::1], new_range[1::1])]
|
||||||
|
normalized_range_sizes = [s * n / total for s, n in zip(range_sizes, range_count)]
|
||||||
|
average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes)
|
||||||
|
return average_range_size / (feature_data['max'] - feature_data['min'])
|
||||||
|
|
||||||
|
def _get_feature_data(self, x):
|
||||||
|
feature_data = {}
|
||||||
|
for feature in self._features:
|
||||||
|
if feature not in feature_data.keys():
|
||||||
|
fd = {}
|
||||||
|
values = list(x.loc[:, feature])
|
||||||
|
if feature not in self.categorical_features:
|
||||||
|
fd['min'] = min(values)
|
||||||
|
fd['max'] = max(values)
|
||||||
|
fd['range'] = max(values) - min(values)
|
||||||
|
else:
|
||||||
|
fd['range'] = len(np.unique(values))
|
||||||
|
feature_data[feature] = fd
|
||||||
|
return feature_data
|
||||||
|
|
||||||
|
def _get_record_indexes_for_cell(self, x, cell, mapped):
|
||||||
indexes = []
|
indexes = []
|
||||||
for index, row in X.iterrows():
|
for index, row in x.iterrows():
|
||||||
if not mapped.item(index) and self._cell_contains(cell, row, index, mapped):
|
if not mapped.item(index) and self._cell_contains(cell, row, index, mapped):
|
||||||
indexes.append(index)
|
indexes.append(index)
|
||||||
return indexes
|
return indexes
|
||||||
|
|
||||||
def _cell_contains(self, cell, x, i, mapped):
|
def _get_record_count_for_cell(self, x, cell, mapped):
|
||||||
for f in self._features:
|
count = 0
|
||||||
if f in cell['ranges']:
|
for index, (_, row) in enumerate(x.iterrows()):
|
||||||
if not self._cell_contains_numeric(f, cell['ranges'][f], x):
|
if not mapped.item(index) and self._cell_contains(cell, row, index, mapped):
|
||||||
|
count += 1
|
||||||
|
return count
|
||||||
|
|
||||||
|
def _cell_contains(self, cell, row, index, mapped):
|
||||||
|
for i, feature in enumerate(self._features):
|
||||||
|
if feature in cell['ranges']:
|
||||||
|
if not self._cell_contains_numeric(i, cell['ranges'][feature], row):
|
||||||
return False
|
return False
|
||||||
elif f in cell['categories']:
|
elif feature in cell['categories']:
|
||||||
if not self._cell_contains_categorical(f, cell['categories'][f], x):
|
if not self._cell_contains_categorical(i, cell['categories'][feature], row):
|
||||||
return False
|
return False
|
||||||
elif f in cell['untouched']:
|
elif feature in cell['untouched']:
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
raise TypeError("feature " + f + "not found in cell" + cell['id'])
|
raise TypeError("feature " + feature + "not found in cell" + cell['id'])
|
||||||
# Mark as mapped
|
# Mark as mapped
|
||||||
mapped.itemset(i, 1)
|
mapped.itemset(index, 1)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _encode_categorical_features(self, X, save_mapping=False):
|
def _encode_categorical_features(self, x, save_mapping=False):
|
||||||
if save_mapping:
|
if save_mapping:
|
||||||
self._categorical_values = {}
|
self._categorical_values = {}
|
||||||
self._one_hot_vector_features_to_features = {}
|
self._one_hot_vector_features_to_features = {}
|
||||||
|
|
@ -456,31 +628,31 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
for feature in self.categorical_features:
|
for feature in self.categorical_features:
|
||||||
if feature in used_features:
|
if feature in used_features:
|
||||||
try:
|
try:
|
||||||
all_values = X.loc[:, feature]
|
all_values = x.loc[:, feature]
|
||||||
values = list(all_values.unique())
|
values = list(all_values.unique())
|
||||||
if save_mapping:
|
if save_mapping:
|
||||||
self._categorical_values[feature] = values
|
self._categorical_values[feature] = values
|
||||||
X[feature] = pd.Categorical(X.loc[:, feature], categories=self._categorical_values[feature],
|
x[feature] = pd.Categorical(x.loc[:, feature], categories=self._categorical_values[feature],
|
||||||
ordered=False)
|
ordered=False)
|
||||||
ohe = pd.get_dummies(X[feature], prefix=feature)
|
ohe = pd.get_dummies(x[feature], prefix=feature)
|
||||||
if save_mapping:
|
if save_mapping:
|
||||||
for one_hot_vector_feature in ohe.columns:
|
for one_hot_vector_feature in ohe.columns:
|
||||||
self._one_hot_vector_features_to_features[one_hot_vector_feature] = feature
|
self._one_hot_vector_features_to_features[one_hot_vector_feature] = feature
|
||||||
X = pd.concat([X, ohe], axis=1)
|
x = pd.concat([x, ohe], axis=1)
|
||||||
features_to_remove.append(feature)
|
features_to_remove.append(feature)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
print("feature " + feature + "not found in training data")
|
print("feature " + feature + "not found in training data")
|
||||||
|
|
||||||
new_data = X.drop(features_to_remove, axis=1)
|
new_data = x.drop(features_to_remove, axis=1)
|
||||||
if save_mapping:
|
if save_mapping:
|
||||||
self._encoded_features = new_data.columns
|
self._encoded_features = new_data.columns
|
||||||
return new_data
|
return new_data
|
||||||
|
|
||||||
def _cell_contains_numeric(self, f, range, x):
|
@staticmethod
|
||||||
i = self._features.index(f)
|
def _cell_contains_numeric(index, range, row):
|
||||||
# convert x to ndarray to allow indexing
|
# convert row to ndarray to allow indexing
|
||||||
a = np.array(x)
|
a = np.array(row)
|
||||||
value = a.item(i)
|
value = a.item(index)
|
||||||
if range['start']:
|
if range['start']:
|
||||||
if value <= range['start']:
|
if value <= range['start']:
|
||||||
return False
|
return False
|
||||||
|
|
@ -489,11 +661,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _cell_contains_categorical(self, f, range, x):
|
@staticmethod
|
||||||
i = self._features.index(f)
|
def _cell_contains_categorical(index, range, row):
|
||||||
# convert x to ndarray to allow indexing
|
# convert row to ndarray to allow indexing
|
||||||
a = np.array(x)
|
a = np.array(row)
|
||||||
value = a.item(i)
|
value = a.item(index)
|
||||||
if value in range:
|
if value in range:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
@ -685,7 +857,29 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
nodeSet = set(nodes)
|
nodeSet = set(nodes)
|
||||||
return [(list(set([i for i, v in enumerate(p) if v == 1]) & nodeSet))[0] for p in paths]
|
return [(list(set([i for i, v in enumerate(p) if v == 1]) & nodeSet))[0] for p in paths]
|
||||||
|
|
||||||
def _generalize(self, original_data, prepared_data, level_nodes, cells, cells_by_id):
|
# method for applying generalizations (for global generalization-based acuuracy) without dt
|
||||||
|
def _generalize_from_generalizations(self, original_data, generalizations):
|
||||||
|
sample_indexes = self._map_to_ranges_categories(original_data,
|
||||||
|
generalizations['ranges'],
|
||||||
|
generalizations['categories'])
|
||||||
|
original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)
|
||||||
|
for feature in self._generalizations['categories']:
|
||||||
|
if 'untouched' not in generalizations or feature not in generalizations['untouched']:
|
||||||
|
for g_index, group in enumerate(generalizations['categories'][feature]):
|
||||||
|
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == g_index]
|
||||||
|
if indexes:
|
||||||
|
rows = original_data_generalized.iloc[indexes]
|
||||||
|
rows[feature] = generalizations['category_representatives'][feature][g_index]
|
||||||
|
for feature in self._generalizations['ranges']:
|
||||||
|
if 'untouched' not in generalizations or feature not in generalizations['untouched']:
|
||||||
|
for r_index, range in enumerate(generalizations['ranges'][feature]):
|
||||||
|
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == r_index]
|
||||||
|
if indexes:
|
||||||
|
rows = original_data_generalized.iloc[indexes]
|
||||||
|
rows[feature] = generalizations['range_representatives'][feature][r_index]
|
||||||
|
return original_data_generalized
|
||||||
|
|
||||||
|
def _generalize_from_tree(self, original_data, prepared_data, level_nodes, cells, cells_by_id):
|
||||||
mapping_to_cells = self._map_to_cells(prepared_data, level_nodes, cells_by_id)
|
mapping_to_cells = self._map_to_cells(prepared_data, level_nodes, cells_by_id)
|
||||||
all_indexes = []
|
all_indexes = []
|
||||||
for i in range(len(cells)):
|
for i in range(len(cells)):
|
||||||
|
|
@ -728,6 +922,29 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
|
|
||||||
return original_data_generalized
|
return original_data_generalized
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _map_to_ranges_categories(samples, ranges, categories):
|
||||||
|
all_sample_indexes = []
|
||||||
|
for _, row in samples.iterrows():
|
||||||
|
sample_indexes = {}
|
||||||
|
for feature in ranges:
|
||||||
|
if not ranges[feature]:
|
||||||
|
# no values means whole range
|
||||||
|
sample_indexes[feature] = 0
|
||||||
|
else:
|
||||||
|
for index, value in enumerate(ranges[feature]):
|
||||||
|
if row[feature] <= value:
|
||||||
|
sample_indexes[feature] = index
|
||||||
|
break
|
||||||
|
sample_indexes[feature] = index + 1
|
||||||
|
for feature in categories:
|
||||||
|
for g_index, group in enumerate(categories[feature]):
|
||||||
|
if row[feature] in group:
|
||||||
|
sample_indexes[feature] = g_index
|
||||||
|
break
|
||||||
|
all_sample_indexes.append(sample_indexes)
|
||||||
|
return all_sample_indexes
|
||||||
|
|
||||||
def _map_to_cells(self, samples, nodes, cells_by_id):
|
def _map_to_cells(self, samples, nodes, cells_by_id):
|
||||||
mapping_to_cells = {}
|
mapping_to_cells = {}
|
||||||
for index, row in samples.iterrows():
|
for index, row in samples.iterrows():
|
||||||
|
|
@ -740,41 +957,46 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
return [cells_by_id[nodeId] for nodeId in node_ids]
|
return [cells_by_id[nodeId] for nodeId in node_ids]
|
||||||
|
|
||||||
def _remove_feature_from_generalization(self, original_data, prepared_data, nodes, labels, feature_data,
|
def _remove_feature_from_generalization(self, original_data, prepared_data, nodes, labels, feature_data,
|
||||||
current_accuracy):
|
current_accuracy, generalize_using_transform):
|
||||||
# prepared data include one hot encoded categorical data,
|
# prepared data include one hot encoded categorical data,
|
||||||
# if there is no categorical data prepared data is original data
|
# if there is no categorical data prepared data is original data
|
||||||
feature = self._get_feature_to_remove(original_data, prepared_data, nodes, labels, feature_data,
|
feature = self._get_feature_to_remove(original_data, prepared_data, nodes, labels, feature_data,
|
||||||
current_accuracy)
|
current_accuracy, generalize_using_transform)
|
||||||
if feature is None:
|
if feature is None:
|
||||||
return None
|
return None
|
||||||
GeneralizeToRepresentative._remove_feature_from_cells(self.cells, self._cells_by_id, feature)
|
self._remove_feature_from_cells(self.cells, self._cells_by_id, feature)
|
||||||
return feature
|
return feature
|
||||||
|
|
||||||
def _get_feature_to_remove(self, original_data, prepared_data, nodes, labels, feature_data, current_accuracy):
|
def _get_feature_to_remove(self, original_data, prepared_data, nodes, labels, feature_data, current_accuracy,
|
||||||
|
generalize_using_transform):
|
||||||
# prepared data include one hot encoded categorical data,
|
# prepared data include one hot encoded categorical data,
|
||||||
# if there is no categorical data prepared data is original data
|
# if there is no categorical data prepared data is original data
|
||||||
# We want to remove features with low iLoss (NCP) and high accuracy gain
|
# We want to remove features with low iLoss (NCP) and high accuracy gain
|
||||||
# (after removing them)
|
# (after removing them)
|
||||||
ranges = self._generalizations['ranges']
|
ranges = self._generalizations['ranges']
|
||||||
range_counts = self._find_range_count(original_data, ranges)
|
range_counts = self._find_range_counts(original_data, ranges)
|
||||||
total = prepared_data.size
|
total = prepared_data.size
|
||||||
range_min = sys.float_info.max
|
range_min = sys.float_info.max
|
||||||
remove_feature = None
|
remove_feature = None
|
||||||
categories = self.generalizations['categories']
|
categories = self.generalizations['categories']
|
||||||
category_counts = self._find_categories_count(original_data, categories)
|
category_counts = self._find_category_counts(original_data, categories)
|
||||||
|
|
||||||
for feature in ranges.keys():
|
for feature in ranges.keys():
|
||||||
if feature not in self._generalizations['untouched']:
|
if feature not in self._generalizations['untouched']:
|
||||||
feature_ncp = self._calc_ncp_numeric(ranges[feature],
|
if generalize_using_transform:
|
||||||
range_counts[feature],
|
feature_ncp = self._calculate_ncp_for_feature_from_cells(feature, feature_data, original_data)
|
||||||
feature_data[feature],
|
else:
|
||||||
total)
|
feature_ncp = self._calc_ncp_numeric(ranges[feature],
|
||||||
|
range_counts[feature],
|
||||||
|
feature_data[feature],
|
||||||
|
total)
|
||||||
if feature_ncp > 0:
|
if feature_ncp > 0:
|
||||||
# divide by accuracy gain
|
# divide by accuracy gain
|
||||||
new_cells = copy.deepcopy(self.cells)
|
new_cells = copy.deepcopy(self.cells)
|
||||||
cells_by_id = copy.deepcopy(self._cells_by_id)
|
cells_by_id = copy.deepcopy(self._cells_by_id)
|
||||||
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
|
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
|
||||||
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
|
generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
|
||||||
|
cells_by_id)
|
||||||
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
|
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
|
||||||
labels)) - current_accuracy
|
labels)) - current_accuracy
|
||||||
if accuracy_gain < 0:
|
if accuracy_gain < 0:
|
||||||
|
|
@ -788,16 +1010,20 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
|
|
||||||
for feature in categories.keys():
|
for feature in categories.keys():
|
||||||
if feature not in self.generalizations['untouched']:
|
if feature not in self.generalizations['untouched']:
|
||||||
feature_ncp = self._calc_ncp_categorical(categories[feature],
|
if generalize_using_transform:
|
||||||
category_counts[feature],
|
feature_ncp = self._calculate_ncp_for_feature_from_cells(feature, feature_data, original_data)
|
||||||
feature_data[feature],
|
else:
|
||||||
total)
|
feature_ncp = self._calc_ncp_categorical(categories[feature],
|
||||||
|
category_counts[feature],
|
||||||
|
feature_data[feature],
|
||||||
|
total)
|
||||||
if feature_ncp > 0:
|
if feature_ncp > 0:
|
||||||
# divide by accuracy loss
|
# divide by accuracy loss
|
||||||
new_cells = copy.deepcopy(self.cells)
|
new_cells = copy.deepcopy(self.cells)
|
||||||
cells_by_id = copy.deepcopy(self._cells_by_id)
|
cells_by_id = copy.deepcopy(self._cells_by_id)
|
||||||
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
|
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
|
||||||
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
|
generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells,
|
||||||
|
cells_by_id)
|
||||||
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
|
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
|
||||||
labels)) - current_accuracy
|
labels)) - current_accuracy
|
||||||
|
|
||||||
|
|
@ -812,31 +1038,119 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
print('feature to remove: ' + (str(remove_feature) if remove_feature is not None else 'none'))
|
print('feature to remove: ' + (str(remove_feature) if remove_feature is not None else 'none'))
|
||||||
return remove_feature
|
return remove_feature
|
||||||
|
|
||||||
def _calculate_generalizations(self):
|
def _calculate_ncp_for_feature_from_cells(self, feature, feature_data, samples_pd):
|
||||||
self._generalizations = {'ranges': GeneralizeToRepresentative._calculate_ranges(self.cells),
|
# count how many records are mapped to each cell
|
||||||
'categories': GeneralizeToRepresentative._calculate_categories(self.cells),
|
counted = np.zeros(samples_pd.shape[0]) # to mark records we already counted
|
||||||
'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells)}
|
total = samples_pd.shape[0]
|
||||||
self._remove_categorical_untouched(self._generalizations)
|
feature_ncp = 0
|
||||||
|
for cell in self.cells:
|
||||||
|
count = self._get_record_count_for_cell(samples_pd, cell, counted)
|
||||||
|
generalizations = self._calculate_generalizations_for_cell(cell)
|
||||||
|
cell_ncp = 0
|
||||||
|
if feature in cell['ranges']:
|
||||||
|
cell_ncp = self._calc_ncp_numeric(generalizations['ranges'][feature],
|
||||||
|
[count],
|
||||||
|
feature_data[feature],
|
||||||
|
total)
|
||||||
|
elif feature in cell['categories']:
|
||||||
|
cell_ncp = self._calc_ncp_categorical(generalizations['categories'][feature],
|
||||||
|
[count],
|
||||||
|
feature_data[feature],
|
||||||
|
total)
|
||||||
|
feature_ncp += cell_ncp
|
||||||
|
return feature_ncp
|
||||||
|
|
||||||
def _find_range_count(self, samples, ranges):
|
def _calculate_generalizations(self, samples: Optional[pd.DataFrame] = None):
|
||||||
samples_df = pd.DataFrame(samples, columns=self._encoded_features)
|
ranges, range_representatives = self._calculate_ranges(self.cells)
|
||||||
|
categories, category_representatives = self._calculate_categories(self.cells)
|
||||||
|
self._generalizations = {'ranges': ranges,
|
||||||
|
'categories': categories,
|
||||||
|
'untouched': self._calculate_untouched(self.cells)}
|
||||||
|
self._remove_categorical_untouched(self._generalizations)
|
||||||
|
# compute representative value for each feature (based on data)
|
||||||
|
if samples is not None:
|
||||||
|
sample_indexes = self._map_to_ranges_categories(samples,
|
||||||
|
self._generalizations['ranges'],
|
||||||
|
self._generalizations['categories'])
|
||||||
|
# categorical - use most common value
|
||||||
|
old_category_representatives = category_representatives
|
||||||
|
category_representatives = {}
|
||||||
|
for feature in self._generalizations['categories']:
|
||||||
|
category_representatives[feature] = []
|
||||||
|
for g_index, group in enumerate(self._generalizations['categories'][feature]):
|
||||||
|
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == g_index]
|
||||||
|
if indexes:
|
||||||
|
rows = samples.iloc[indexes]
|
||||||
|
values = rows[feature]
|
||||||
|
category = Counter(values).most_common(1)[0][0]
|
||||||
|
category_representatives[feature].append(category)
|
||||||
|
else:
|
||||||
|
category_representatives[feature].append(old_category_representatives[feature][g_index])
|
||||||
|
|
||||||
|
# numerical - use actual value closest to mean
|
||||||
|
old_range_representatives = range_representatives
|
||||||
|
range_representatives = {}
|
||||||
|
for feature in self._generalizations['ranges']:
|
||||||
|
range_representatives[feature] = []
|
||||||
|
# find the mean value (per feature)
|
||||||
|
for index in range(len(self._generalizations['ranges'][feature])):
|
||||||
|
indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == index]
|
||||||
|
if indexes:
|
||||||
|
rows = samples.iloc[indexes]
|
||||||
|
values = rows[feature]
|
||||||
|
median = np.median(values)
|
||||||
|
min_value = max(values)
|
||||||
|
min_dist = float("inf")
|
||||||
|
for value in values:
|
||||||
|
# euclidean distance between two floating point values
|
||||||
|
dist = abs(value - median)
|
||||||
|
if dist < min_dist:
|
||||||
|
min_dist = dist
|
||||||
|
min_value = value
|
||||||
|
range_representatives[feature].append(min_value)
|
||||||
|
else:
|
||||||
|
range_representatives[feature].append(old_range_representatives[feature][index])
|
||||||
|
self._generalizations['category_representatives'] = category_representatives
|
||||||
|
self._generalizations['range_representatives'] = range_representatives
|
||||||
|
|
||||||
|
def _calculate_generalizations_for_cell(self, cell):
|
||||||
|
ranges, range_representatives = self._calculate_ranges([cell])
|
||||||
|
categories, category_representatives = self._calculate_categories([cell])
|
||||||
|
generalizations = {'ranges': ranges,
|
||||||
|
'categories': categories,
|
||||||
|
'untouched': self._calculate_untouched([cell]),
|
||||||
|
'range_representatives': range_representatives,
|
||||||
|
'category_representatives': category_representatives}
|
||||||
|
self._remove_categorical_untouched(generalizations)
|
||||||
|
return generalizations
|
||||||
|
|
||||||
|
def _calculate_cell_generalizations(self):
|
||||||
|
# calculate generalizations separately per cell
|
||||||
|
cell_generalizations = {}
|
||||||
|
for cell in self.cells:
|
||||||
|
cell_generalizations[cell['id']] = self._calculate_generalizations_for_cell(cell)
|
||||||
|
return cell_generalizations
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _find_range_counts(samples, ranges):
|
||||||
range_counts = {}
|
range_counts = {}
|
||||||
last_value = None
|
last_value = None
|
||||||
for r in ranges.keys():
|
for r in ranges.keys():
|
||||||
range_counts[r] = []
|
range_counts[r] = []
|
||||||
# if empty list, all samples should be counted
|
# if empty list, all samples should be counted
|
||||||
if not ranges[r]:
|
if not ranges[r]:
|
||||||
range_counts[r].append(samples_df.shape[0])
|
range_counts[r].append(samples.shape[0])
|
||||||
else:
|
else:
|
||||||
for value in ranges[r]:
|
for value in ranges[r]:
|
||||||
counter = [item for item in samples_df[r] if int(item) <= value]
|
counter = [item for item in samples[r] if int(item) <= value]
|
||||||
range_counts[r].append(len(counter))
|
range_counts[r].append(len(counter))
|
||||||
last_value = value
|
last_value = value
|
||||||
counter = [item for item in samples_df[r] if int(item) <= last_value]
|
counter = [item for item in samples[r] if int(item) > last_value]
|
||||||
range_counts[r].append(len(counter))
|
range_counts[r].append(len(counter))
|
||||||
return range_counts
|
return range_counts
|
||||||
|
|
||||||
def _find_categories_count(self, samples, categories):
|
@staticmethod
|
||||||
|
def _find_category_counts(samples, categories):
|
||||||
category_counts = {}
|
category_counts = {}
|
||||||
for c in categories.keys():
|
for c in categories.keys():
|
||||||
category_counts[c] = []
|
category_counts[c] = []
|
||||||
|
|
@ -844,34 +1158,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
category_counts[c].append(len(samples.loc[samples[c].isin(value)]))
|
category_counts[c].append(len(samples.loc[samples[c].isin(value)]))
|
||||||
return category_counts
|
return category_counts
|
||||||
|
|
||||||
def _calculate_ncp(self, samples, generalizations, feature_data):
|
|
||||||
# supressed features are already taken care of within _calc_ncp_numeric
|
|
||||||
ranges = generalizations['ranges']
|
|
||||||
categories = generalizations['categories']
|
|
||||||
range_counts = self._find_range_count(samples, ranges)
|
|
||||||
category_counts = self._find_categories_count(samples, categories)
|
|
||||||
|
|
||||||
total = samples.shape[0]
|
|
||||||
total_ncp = 0
|
|
||||||
total_features = len(generalizations['untouched'])
|
|
||||||
for feature in ranges.keys():
|
|
||||||
feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature],
|
|
||||||
feature_data[feature], total)
|
|
||||||
total_ncp = total_ncp + feature_ncp
|
|
||||||
total_features += 1
|
|
||||||
for feature in categories.keys():
|
|
||||||
featureNCP = self._calc_ncp_categorical(categories[feature], category_counts[feature],
|
|
||||||
feature_data[feature],
|
|
||||||
total)
|
|
||||||
total_ncp = total_ncp + featureNCP
|
|
||||||
total_features += 1
|
|
||||||
if total_features == 0:
|
|
||||||
return 0
|
|
||||||
return total_ncp / total_features
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _calculate_ranges(cells):
|
def _calculate_ranges(cells):
|
||||||
ranges = {}
|
ranges = {}
|
||||||
|
range_representatives = {}
|
||||||
for cell in cells:
|
for cell in cells:
|
||||||
for feature in [key for key in cell['ranges'].keys() if
|
for feature in [key for key in cell['ranges'].keys() if
|
||||||
'untouched' not in cell or key not in cell['untouched']]:
|
'untouched' not in cell or key not in cell['untouched']]:
|
||||||
|
|
@ -881,17 +1171,37 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
ranges[feature].append(cell['ranges'][feature]['start'])
|
ranges[feature].append(cell['ranges'][feature]['start'])
|
||||||
if cell['ranges'][feature]['end'] is not None:
|
if cell['ranges'][feature]['end'] is not None:
|
||||||
ranges[feature].append(cell['ranges'][feature]['end'])
|
ranges[feature].append(cell['ranges'][feature]['end'])
|
||||||
|
# default representative values (computed with no data)
|
||||||
for feature in ranges.keys():
|
for feature in ranges.keys():
|
||||||
ranges[feature] = list(set(ranges[feature]))
|
range_representatives[feature] = []
|
||||||
ranges[feature].sort()
|
if not ranges[feature]:
|
||||||
return ranges
|
# no values means the complete range. Without data we cannot know what to put here.
|
||||||
|
# Using 0 as a placeholder.
|
||||||
|
range_representatives[feature].append(0)
|
||||||
|
else:
|
||||||
|
ranges[feature] = list(set(ranges[feature]))
|
||||||
|
ranges[feature].sort()
|
||||||
|
prev_value = 0
|
||||||
|
for index, value in enumerate(ranges[feature]):
|
||||||
|
if index == 0:
|
||||||
|
# for first range, use min value
|
||||||
|
range_representatives[feature].append(value)
|
||||||
|
else:
|
||||||
|
# use middle of range (this will be a float)
|
||||||
|
range_representatives[feature].append((value - prev_value) / 2)
|
||||||
|
prev_value = value
|
||||||
|
# for last range use max value + 1
|
||||||
|
range_representatives[feature].append(prev_value + 1)
|
||||||
|
return ranges, range_representatives
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _calculate_categories(cells):
|
def _calculate_categories(cells):
|
||||||
categories = {}
|
categories = {}
|
||||||
|
category_representatives = {}
|
||||||
categorical_features_values = GeneralizeToRepresentative._calculate_categorical_features_values(cells)
|
categorical_features_values = GeneralizeToRepresentative._calculate_categorical_features_values(cells)
|
||||||
for feature in categorical_features_values.keys():
|
for feature in categorical_features_values.keys():
|
||||||
partitions = []
|
partitions = []
|
||||||
|
category_representatives[feature] = []
|
||||||
values = categorical_features_values[feature]
|
values = categorical_features_values[feature]
|
||||||
assigned = []
|
assigned = []
|
||||||
for i in range(len(values)):
|
for i in range(len(values)):
|
||||||
|
|
@ -908,8 +1218,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
partition.append(value2)
|
partition.append(value2)
|
||||||
assigned.append(value2)
|
assigned.append(value2)
|
||||||
partitions.append(partition)
|
partitions.append(partition)
|
||||||
|
# default representative values (computed with no data)
|
||||||
|
category_representatives[feature].append(partition[0]) # random
|
||||||
categories[feature] = partitions
|
categories[feature] = partitions
|
||||||
return categories
|
return categories, category_representatives
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _calculate_categorical_features_values(cells):
|
def _calculate_categorical_features_values(cells):
|
||||||
|
|
@ -942,26 +1254,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
untouched = untouched.intersection(*untouched_lists)
|
untouched = untouched.intersection(*untouched_lists)
|
||||||
return list(untouched)
|
return list(untouched)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _calc_ncp_categorical(categories, categoryCount, feature_data, total):
|
|
||||||
category_sizes = [len(g) if len(g) > 1 else 0 for g in categories]
|
|
||||||
normalized_category_sizes = [s * n / total for s, n in zip(category_sizes, categoryCount)]
|
|
||||||
average_group_size = sum(normalized_category_sizes) / len(normalized_category_sizes)
|
|
||||||
return average_group_size / feature_data['range'] # number of values in category
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _calc_ncp_numeric(feature_range, range_count, feature_data, total):
|
|
||||||
# if there are no ranges, feature is supressed and iLoss is 1
|
|
||||||
if not feature_range:
|
|
||||||
return 1
|
|
||||||
# range only contains the split values, need to add min and max value of feature
|
|
||||||
# to enable computing sizes of all ranges
|
|
||||||
new_range = [feature_data['min']] + feature_range + [feature_data['max']]
|
|
||||||
range_sizes = [b - a for a, b in zip(new_range[::1], new_range[1::1])]
|
|
||||||
normalized_range_sizes = [s * n / total for s, n in zip(range_sizes, range_count)]
|
|
||||||
average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes)
|
|
||||||
return average_range_size / (feature_data['max'] - feature_data['min'])
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _remove_feature_from_cells(cells, cells_by_id, feature):
|
def _remove_feature_from_cells(cells, cells_by_id, feature):
|
||||||
for cell in cells:
|
for cell in cells:
|
||||||
|
|
|
||||||
|
|
@ -15,11 +15,12 @@ import pandas as pd
|
||||||
import logging
|
import logging
|
||||||
import torch
|
import torch
|
||||||
from torch import Tensor
|
from torch import Tensor
|
||||||
|
from scipy.sparse import csr_matrix
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
INPUT_DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame, List, Tensor]
|
INPUT_DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame, List, Tensor, csr_matrix]
|
||||||
OUTPUT_DATA_ARRAY_TYPE = np.ndarray
|
OUTPUT_DATA_ARRAY_TYPE = np.ndarray
|
||||||
DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame]
|
DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame]
|
||||||
|
|
||||||
|
|
@ -29,14 +30,16 @@ def array2numpy(arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
|
||||||
"""
|
"""
|
||||||
converts from INPUT_DATA_ARRAY_TYPE to numpy array
|
converts from INPUT_DATA_ARRAY_TYPE to numpy array
|
||||||
"""
|
"""
|
||||||
if type(arr) == np.ndarray:
|
if isinstance(arr, np.ndarray):
|
||||||
return arr
|
return arr
|
||||||
if type(arr) == pd.DataFrame or type(arr) == pd.Series:
|
if isinstance(arr, pd.DataFrame) or isinstance(arr, pd.Series):
|
||||||
return arr.to_numpy()
|
return arr.to_numpy()
|
||||||
if isinstance(arr, list):
|
if isinstance(arr, list):
|
||||||
return np.array(arr)
|
return np.array(arr)
|
||||||
if type(arr) == Tensor:
|
if isinstance(arr, Tensor):
|
||||||
return arr.detach().cpu().numpy()
|
return arr.detach().cpu().numpy()
|
||||||
|
if isinstance(arr, csr_matrix):
|
||||||
|
return arr.toarray()
|
||||||
|
|
||||||
raise ValueError("Non supported type: ", type(arr).__name__)
|
raise ValueError("Non supported type: ", type(arr).__name__)
|
||||||
|
|
||||||
|
|
@ -45,14 +48,16 @@ def array2torch_tensor(arr: INPUT_DATA_ARRAY_TYPE) -> Tensor:
|
||||||
"""
|
"""
|
||||||
converts from INPUT_DATA_ARRAY_TYPE to torch tensor array
|
converts from INPUT_DATA_ARRAY_TYPE to torch tensor array
|
||||||
"""
|
"""
|
||||||
if type(arr) == np.ndarray:
|
if isinstance(arr, np.ndarray):
|
||||||
return torch.from_numpy(arr)
|
return torch.from_numpy(arr)
|
||||||
if type(arr) == pd.DataFrame or type(arr) == pd.Series:
|
if isinstance(arr, pd.DataFrame) or isinstance(arr, pd.Series):
|
||||||
return torch.from_numpy(arr.to_numpy())
|
return torch.from_numpy(arr.to_numpy())
|
||||||
if isinstance(arr, list):
|
if isinstance(arr, list):
|
||||||
return torch.tensor(arr)
|
return torch.tensor(arr)
|
||||||
if type(arr) == Tensor:
|
if isinstance(arr, Tensor):
|
||||||
return arr
|
return arr
|
||||||
|
if isinstance(arr, csr_matrix):
|
||||||
|
return torch.from_numpy(arr.toarray())
|
||||||
|
|
||||||
raise ValueError("Non supported type: ", type(arr).__name__)
|
raise ValueError("Non supported type: ", type(arr).__name__)
|
||||||
|
|
||||||
|
|
@ -217,7 +222,7 @@ class ArrayDataset(Dataset):
|
||||||
|
|
||||||
def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
|
def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
|
||||||
features_names: Optional[list] = None, **kwargs):
|
features_names: Optional[list] = None, **kwargs):
|
||||||
self.is_pandas = self.is_pandas = type(x) == pd.DataFrame or type(x) == pd.Series
|
self.is_pandas = self.is_pandas = isinstance(x, pd.DataFrame) or isinstance(x, pd.Series)
|
||||||
|
|
||||||
self.features_names = features_names
|
self.features_names = features_names
|
||||||
self._y = array2numpy(y) if y is not None else None
|
self._y = array2numpy(y) if y is not None else None
|
||||||
|
|
@ -325,7 +330,7 @@ class PytorchData(Dataset):
|
||||||
self._y = array2torch_tensor(y) if y is not None else None
|
self._y = array2torch_tensor(y) if y is not None else None
|
||||||
self._x = array2torch_tensor(x)
|
self._x = array2torch_tensor(x)
|
||||||
|
|
||||||
self.is_pandas = type(x) == pd.DataFrame or type(x) == pd.Series
|
self.is_pandas = isinstance(x, pd.DataFrame) or isinstance(x, pd.Series)
|
||||||
|
|
||||||
if self.is_pandas:
|
if self.is_pandas:
|
||||||
self.features_names = x.columns
|
self.features_names = x.columns
|
||||||
|
|
|
||||||
|
|
@ -43,7 +43,7 @@ def get_nb_classes(y: OUTPUT_DATA_ARRAY_TYPE) -> int:
|
||||||
if y is None:
|
if y is None:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
if type(y) != np.ndarray:
|
if not isinstance(y, np.ndarray):
|
||||||
raise ValueError("Input should be numpy array")
|
raise ValueError("Input should be numpy array")
|
||||||
|
|
||||||
if is_one_hot(y):
|
if is_one_hot(y):
|
||||||
|
|
@ -339,8 +339,8 @@ class BlackboxClassifierPredictions(BlackboxClassifier):
|
||||||
y_test_pred = check_and_transform_label_format(y_test_pred, nb_classes=self._nb_classes)
|
y_test_pred = check_and_transform_label_format(y_test_pred, nb_classes=self._nb_classes)
|
||||||
|
|
||||||
if x_train_pred is not None and y_train_pred is not None and x_test_pred is not None and y_test_pred is not None:
|
if x_train_pred is not None and y_train_pred is not None and x_test_pred is not None and y_test_pred is not None:
|
||||||
if type(y_train_pred) != np.ndarray or type(y_test_pred) != np.ndarray \
|
if not isinstance(y_train_pred, np.ndarray) or not isinstance(y_test_pred, np.ndarray) \
|
||||||
or type(y_train_pred) != np.ndarray or type(y_test_pred) != np.ndarray:
|
or not isinstance(y_train_pred, np.ndarray) or not isinstance(y_test_pred, np.ndarray):
|
||||||
raise NotImplementedError("X/Y Data should be numpy array")
|
raise NotImplementedError("X/Y Data should be numpy array")
|
||||||
x_pred = np.vstack((x_train_pred, x_test_pred))
|
x_pred = np.vstack((x_train_pred, x_test_pred))
|
||||||
y_pred = np.vstack((y_train_pred, y_test_pred))
|
y_pred = np.vstack((y_train_pred, y_test_pred))
|
||||||
|
|
|
||||||
|
|
@ -46,7 +46,7 @@ class SklearnClassifier(SklearnModel):
|
||||||
def __init__(self, model: BaseEstimator, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
|
def __init__(self, model: BaseEstimator, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
|
||||||
unlimited_queries: Optional[bool] = True, **kwargs):
|
unlimited_queries: Optional[bool] = True, **kwargs):
|
||||||
super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs)
|
super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs)
|
||||||
self._art_model = ArtSklearnClassifier(model)
|
self._art_model = ArtSklearnClassifier(model, preprocessing=None)
|
||||||
|
|
||||||
def fit(self, train_data: Dataset, **kwargs) -> None:
|
def fit(self, train_data: Dataset, **kwargs) -> None:
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
numpy==1.22.0
|
numpy==1.24.2
|
||||||
pandas~=1.1.0
|
pandas==1.1.05
|
||||||
scipy==1.4.1
|
scipy==1.10.1
|
||||||
scikit-learn>=0.22.2,<=1.1.3
|
scikit-learn>=0.22.2,<=1.1.3
|
||||||
torch>=1.8.0
|
torch>=1.8.0
|
||||||
tqdm>=4.64.1
|
tqdm>=4.64.1
|
||||||
|
|
|
||||||
|
|
@ -18,7 +18,7 @@ MIN_SHARE = 0.5
|
||||||
MIN_ROC_AUC = 0.0
|
MIN_ROC_AUC = 0.0
|
||||||
MIN_PRECISION = 0.0
|
MIN_PRECISION = 0.0
|
||||||
|
|
||||||
NUM_SYNTH_SAMPLES = 40000
|
NUM_SYNTH_SAMPLES = 400
|
||||||
NUM_SYNTH_COMPONENTS = 4
|
NUM_SYNTH_COMPONENTS = 4
|
||||||
|
|
||||||
iris_dataset_np = get_iris_dataset_np()
|
iris_dataset_np = get_iris_dataset_np()
|
||||||
|
|
@ -109,8 +109,8 @@ def kde(n_samples, n_components, original_data):
|
||||||
digit_data = original_data
|
digit_data = original_data
|
||||||
pca = PCA(n_components=n_components, whiten=False)
|
pca = PCA(n_components=n_components, whiten=False)
|
||||||
data = pca.fit_transform(digit_data)
|
data = pca.fit_transform(digit_data)
|
||||||
params = {'bandwidth': np.logspace(-1, 1, 20)}
|
params = {'bandwidth': np.logspace(-1, 1, 10)}
|
||||||
grid = GridSearchCV(KernelDensity(), params, cv=5)
|
grid = GridSearchCV(KernelDensity(), params, cv=2)
|
||||||
grid.fit(data)
|
grid.fit(data)
|
||||||
|
|
||||||
kde_estimator = grid.best_estimator_
|
kde_estimator = grid.best_estimator_
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue