mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-05-09 03:52:38 +02:00
Initial commit. Tests not yet passing.
Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
parent
c3d2e9c7d0
commit
51340fa554
2 changed files with 233 additions and 89 deletions
|
|
@ -41,6 +41,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
:param target_accuracy: The required relative accuracy when applying the base model to the generalized data.
|
:param target_accuracy: The required relative accuracy when applying the base model to the generalized data.
|
||||||
Accuracy is measured relative to the original accuracy of the model.
|
Accuracy is measured relative to the original accuracy of the model.
|
||||||
:type target_accuracy: float, optional
|
:type target_accuracy: float, optional
|
||||||
|
:param generalize_using_transform: Indicates how to calculate NCP and accuracy during the generalization process.
|
||||||
|
True means that the `transform` method is used to transform original data into
|
||||||
|
generalized data that is used for accuracy and NCP calculation. False indicates
|
||||||
|
that the `generalizations` structure should be used. Default is True.
|
||||||
|
:type generalize_using_transform: boolean, optional
|
||||||
:param cells: The cells used to generalize records. Each cell must define a range or subset of categories for
|
:param cells: The cells used to generalize records. Each cell must define a range or subset of categories for
|
||||||
each feature, as well as a representative value for each feature. This parameter should be used
|
each feature, as well as a representative value for each feature. This parameter should be used
|
||||||
when instantiating a transformer object without first fitting it.
|
when instantiating a transformer object without first fitting it.
|
||||||
|
|
@ -61,8 +66,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
:type is_regression: boolean, optional
|
:type is_regression: boolean, optional
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: Optional[float] = 0.998,
|
def __init__(self, estimator: Union[BaseEstimator, Model] = None,
|
||||||
cells: Optional[list] = None, categorical_features: Optional[Union[np.ndarray, list]] = None,
|
target_accuracy: Optional[float] = 0.998,
|
||||||
|
generalize_using_transform: Optional[bool] = True,
|
||||||
|
cells: Optional[list] = None,
|
||||||
|
categorical_features: Optional[Union[np.ndarray, list]] = None,
|
||||||
encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None,
|
encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None,
|
||||||
features_to_minimize: Optional[Union[np.ndarray, list]] = None,
|
features_to_minimize: Optional[Union[np.ndarray, list]] = None,
|
||||||
train_only_features_to_minimize: Optional[bool] = True,
|
train_only_features_to_minimize: Optional[bool] = True,
|
||||||
|
|
@ -76,6 +84,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_PROBABILITIES)
|
self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_PROBABILITIES)
|
||||||
self.target_accuracy = target_accuracy
|
self.target_accuracy = target_accuracy
|
||||||
self.cells = cells
|
self.cells = cells
|
||||||
|
if cells:
|
||||||
|
self._calculate_generalizations()
|
||||||
self.categorical_features = []
|
self.categorical_features = []
|
||||||
if categorical_features:
|
if categorical_features:
|
||||||
self.categorical_features = categorical_features
|
self.categorical_features = categorical_features
|
||||||
|
|
@ -83,6 +93,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
self.train_only_features_to_minimize = train_only_features_to_minimize
|
self.train_only_features_to_minimize = train_only_features_to_minimize
|
||||||
self.is_regression = is_regression
|
self.is_regression = is_regression
|
||||||
self.encoder = encoder
|
self.encoder = encoder
|
||||||
|
# self.generalize_using_transform = generalize_using_transform
|
||||||
|
self.generalize_using_transform = False
|
||||||
|
self._ncp = 0.0
|
||||||
|
self._feature_data = {}
|
||||||
|
self._categorical_values = {}
|
||||||
|
self._dt = None
|
||||||
|
self._features = None
|
||||||
|
self._level = 0
|
||||||
|
|
||||||
def get_params(self, deep=True):
|
def get_params(self, deep=True):
|
||||||
"""
|
"""
|
||||||
|
|
@ -99,6 +117,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
ret['features_to_minimize'] = self.features_to_minimize
|
ret['features_to_minimize'] = self.features_to_minimize
|
||||||
ret['train_only_features_to_minimize'] = self.train_only_features_to_minimize
|
ret['train_only_features_to_minimize'] = self.train_only_features_to_minimize
|
||||||
ret['is_regression'] = self.is_regression
|
ret['is_regression'] = self.is_regression
|
||||||
|
ret['generalize_using_transform'] = self.generalize_using_transform
|
||||||
if deep:
|
if deep:
|
||||||
ret['cells'] = copy.deepcopy(self.cells)
|
ret['cells'] = copy.deepcopy(self.cells)
|
||||||
ret['estimator'] = self.estimator
|
ret['estimator'] = self.estimator
|
||||||
|
|
@ -132,6 +151,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
self.is_regression = params['is_regression']
|
self.is_regression = params['is_regression']
|
||||||
if 'cells' in params:
|
if 'cells' in params:
|
||||||
self.cells = params['cells']
|
self.cells = params['cells']
|
||||||
|
if 'generalize_using_transform' in params:
|
||||||
|
self.generalize_using_transform = params['generalize_using_transform']
|
||||||
return self
|
return self
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
@ -140,17 +161,18 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
Return the generalizations derived from the model and test data.
|
Return the generalizations derived from the model and test data.
|
||||||
|
|
||||||
:return: generalizations object. Contains 3 sections: 'ranges' that contains ranges for numerical features,
|
:return: generalizations object. Contains 3 sections: 'ranges' that contains ranges for numerical features,
|
||||||
'categories' that contains sub-groups of categories for categorical features, and
|
'categories' that contains sub-groups of categories for categorical features, and
|
||||||
'untouched' that contains the features that could not be generalized.
|
'untouched' that contains the features that could not be generalized.
|
||||||
"""
|
"""
|
||||||
return self._generalizations
|
return self._generalizations
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def ncp(self):
|
def ncp(self):
|
||||||
"""
|
"""
|
||||||
Return the NCP score of the generalizations.
|
Return the last calculated NCP score. NCP score is calculated upon calling `fit` (on the training data),
|
||||||
|
`transform` (on the test data) or when explicitly calling `calculate_ncp` and providing it a dataset.
|
||||||
|
|
||||||
:return: ncp score as float.
|
:return: NCP score as float.
|
||||||
"""
|
"""
|
||||||
return self._ncp
|
return self._ncp
|
||||||
|
|
||||||
|
|
@ -251,9 +273,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
used_X_test = X_test_QI
|
used_X_test = X_test_QI
|
||||||
|
|
||||||
# collect feature data (such as min, max)
|
# collect feature data (such as min, max)
|
||||||
feature_data = {}
|
self._feature_data = {}
|
||||||
for feature in self._features:
|
for feature in self._features:
|
||||||
if feature not in feature_data.keys():
|
if feature not in self._feature_data.keys():
|
||||||
fd = {}
|
fd = {}
|
||||||
values = list(x.loc[:, feature])
|
values = list(x.loc[:, feature])
|
||||||
if feature not in self.categorical_features:
|
if feature not in self.categorical_features:
|
||||||
|
|
@ -262,7 +284,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
fd['range'] = max(values) - min(values)
|
fd['range'] = max(values) - min(values)
|
||||||
else:
|
else:
|
||||||
fd['range'] = len(np.unique(values))
|
fd['range'] = len(np.unique(values))
|
||||||
feature_data[feature] = fd
|
self._feature_data[feature] = fd
|
||||||
|
|
||||||
# default encoder in case none provided
|
# default encoder in case none provided
|
||||||
if self.encoder is None:
|
if self.encoder is None:
|
||||||
|
|
@ -316,17 +338,18 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
# if accuracy above threshold, improve generalization
|
# if accuracy above threshold, improve generalization
|
||||||
if accuracy > self.target_accuracy:
|
if accuracy > self.target_accuracy:
|
||||||
print('Improving generalizations')
|
print('Improving generalizations')
|
||||||
level = 1
|
self._level = 1
|
||||||
while accuracy > self.target_accuracy:
|
while accuracy > self.target_accuracy:
|
||||||
cells_previous_iter = self.cells
|
cells_previous_iter = self.cells
|
||||||
generalization_prev_iter = self._generalizations
|
generalization_prev_iter = self._generalizations
|
||||||
cells_by_id_prev = self._cells_by_id
|
cells_by_id_prev = self._cells_by_id
|
||||||
nodes = self._get_nodes_level(level)
|
nodes = self._get_nodes_level(self._level)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self._calculate_level_cells(level)
|
self._calculate_level_cells(self._level)
|
||||||
except TypeError as e:
|
except TypeError as e:
|
||||||
print(e)
|
print(e)
|
||||||
|
self._level -= 1
|
||||||
break
|
break
|
||||||
|
|
||||||
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
|
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
|
||||||
|
|
@ -340,10 +363,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
self.cells = cells_previous_iter
|
self.cells = cells_previous_iter
|
||||||
self._generalizations = generalization_prev_iter
|
self._generalizations = generalization_prev_iter
|
||||||
self._cells_by_id = cells_by_id_prev
|
self._cells_by_id = cells_by_id_prev
|
||||||
|
self._level -= 1
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
print('Pruned tree to level: %d, new relative accuracy: %f' % (level, accuracy))
|
print('Pruned tree to level: %d, new relative accuracy: %f' % (self._level, accuracy))
|
||||||
level += 1
|
self._level += 1
|
||||||
|
|
||||||
# if accuracy below threshold, improve accuracy by removing features from generalization
|
# if accuracy below threshold, improve accuracy by removing features from generalization
|
||||||
elif accuracy < self.target_accuracy:
|
elif accuracy < self.target_accuracy:
|
||||||
|
|
@ -351,7 +375,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
while accuracy < self.target_accuracy:
|
while accuracy < self.target_accuracy:
|
||||||
removed_feature = self._remove_feature_from_generalization(X_test, x_prepared_test,
|
removed_feature = self._remove_feature_from_generalization(X_test, x_prepared_test,
|
||||||
nodes, y_test,
|
nodes, y_test,
|
||||||
feature_data, accuracy)
|
self._feature_data, accuracy)
|
||||||
if removed_feature is None:
|
if removed_feature is None:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
@ -363,7 +387,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
# self._cells currently holds the chosen generalization based on target accuracy
|
# self._cells currently holds the chosen generalization based on target accuracy
|
||||||
|
|
||||||
# calculate iLoss
|
# calculate iLoss
|
||||||
self._ncp = self._calculate_ncp(X_test, self._generalizations, feature_data)
|
self.calculate_ncp(X_test)
|
||||||
|
|
||||||
# Return the transformer
|
# Return the transformer
|
||||||
return self
|
return self
|
||||||
|
|
@ -383,7 +407,66 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
|
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
|
||||||
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
|
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
|
||||||
"""
|
"""
|
||||||
|
transformed = self._inner_transform(X, features_names, dataset)
|
||||||
|
if not self.generalize_using_transform:
|
||||||
|
raise ValueError('transform method called even though generalize_using_transform parameter was False. This '
|
||||||
|
'can lead to inconsistent results.')
|
||||||
|
self.calculate_ncp(transformed, True)
|
||||||
|
return transformed
|
||||||
|
|
||||||
|
def calculate_ncp(self, samples: Optional[DATA_PANDAS_NUMPY_TYPE] = None, transformed: Optional[bool] = False):
|
||||||
|
"""
|
||||||
|
Compute the NCP score of the generalization. Calculation is based on the value of the
|
||||||
|
generalize_using_transform param.
|
||||||
|
Based on the NCP score presented in: Ghinita, G., Karras, P., Kalnis, P., Mamoulis, N.: Fast data anonymization
|
||||||
|
with low information loss (https://www.vldb.org/conf/2007/papers/research/p758-ghinita.pdf)
|
||||||
|
|
||||||
|
:param samples: The input samples to compute the NCP score on. Ideally should be the data that will be
|
||||||
|
transformed (e.g., test/runtime data). If no samples are supplied, will return the last NCP score
|
||||||
|
computed by the `fit` or `transform` method.
|
||||||
|
:type samples: {array-like, sparse matrix}, shape (n_samples, n_features), optional
|
||||||
|
:param transformed: Whether the supplied samples have already been transformed using the `transform` method.
|
||||||
|
Default is False.
|
||||||
|
:type transformed: boolean, optional
|
||||||
|
:return: NCP score as float.
|
||||||
|
"""
|
||||||
|
if samples is None:
|
||||||
|
return self._ncp
|
||||||
|
elif self.generalize_using_transform:
|
||||||
|
if not transformed:
|
||||||
|
# transform data
|
||||||
|
transformed_data = self._inner_transform(samples)
|
||||||
|
else:
|
||||||
|
transformed_data = samples
|
||||||
|
#TODO
|
||||||
|
else: # use generalizations
|
||||||
|
# suppressed features are already taken care of within _calc_ncp_numeric
|
||||||
|
ranges = self.generalizations['ranges']
|
||||||
|
categories = self.generalizations['categories']
|
||||||
|
range_counts = self._find_range_count(samples, ranges)
|
||||||
|
category_counts = self._find_categories_count(samples, categories)
|
||||||
|
|
||||||
|
total = samples.shape[0]
|
||||||
|
total_ncp = 0
|
||||||
|
total_features = len(self.generalizations['untouched'])
|
||||||
|
for feature in ranges.keys():
|
||||||
|
feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature],
|
||||||
|
self._feature_data[feature], total)
|
||||||
|
total_ncp = total_ncp + feature_ncp
|
||||||
|
total_features += 1
|
||||||
|
for feature in categories.keys():
|
||||||
|
feature_ncp = self._calc_ncp_categorical(categories[feature], category_counts[feature],
|
||||||
|
self._feature_data[feature],
|
||||||
|
total)
|
||||||
|
total_ncp = total_ncp + feature_ncp
|
||||||
|
total_features += 1
|
||||||
|
if total_features == 0:
|
||||||
|
return 0
|
||||||
|
self._ncp = total_ncp / total_features
|
||||||
|
return self._ncp
|
||||||
|
|
||||||
|
def _inner_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None,
|
||||||
|
dataset: Optional[ArrayDataset] = None):
|
||||||
# Check if fit has been called
|
# Check if fit has been called
|
||||||
msg = 'This %(name)s instance is not initialized yet. ' \
|
msg = 'This %(name)s instance is not initialized yet. ' \
|
||||||
'Call ‘fit’ or ‘set_params’ with ' \
|
'Call ‘fit’ or ‘set_params’ with ' \
|
||||||
|
|
@ -409,12 +492,21 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
if not self._features:
|
if not self._features:
|
||||||
self._features = [i for i in range(x.shape[1])]
|
self._features = [i for i in range(x.shape[1])]
|
||||||
|
|
||||||
mapped = np.zeros(x.shape[0]) # to mark records we already mapped
|
if self._dt: # only works if fit was called previously (but much more efficient)
|
||||||
all_indexes = []
|
nodes = self._get_nodes_level(self._level)
|
||||||
for i in range(len(self.cells)):
|
QI = x.loc[:, self.features_to_minimize]
|
||||||
indexes = self._get_record_indexes_for_cell(x, self.cells[i], mapped)
|
used_x = x
|
||||||
all_indexes.append(indexes)
|
if self.train_only_features_to_minimize:
|
||||||
generalized = self._generalize_indexes(x, self.cells, all_indexes)
|
used_x = QI
|
||||||
|
prepared = self._encode_categorical_features(used_x)
|
||||||
|
generalized = self._generalize(x, prepared, nodes, self.cells, self._cells_by_id)
|
||||||
|
else:
|
||||||
|
mapped = np.zeros(x.shape[0]) # to mark records we already mapped
|
||||||
|
all_indexes = []
|
||||||
|
for i in range(len(self.cells)):
|
||||||
|
indexes = self._get_record_indexes_for_cell(x, self.cells[i], mapped)
|
||||||
|
all_indexes.append(indexes)
|
||||||
|
generalized = self._generalize_indexes(x, self.cells, all_indexes)
|
||||||
|
|
||||||
if dataset and dataset.is_pandas:
|
if dataset and dataset.is_pandas:
|
||||||
return generalized
|
return generalized
|
||||||
|
|
@ -422,6 +514,26 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
return generalized
|
return generalized
|
||||||
return generalized.to_numpy()
|
return generalized.to_numpy()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _calc_ncp_categorical(categories, category_count, feature_data, total):
|
||||||
|
category_sizes = [len(g) if len(g) > 1 else 0 for g in categories]
|
||||||
|
normalized_category_sizes = [s * n / total for s, n in zip(category_sizes, category_count)]
|
||||||
|
average_group_size = sum(normalized_category_sizes) / len(normalized_category_sizes)
|
||||||
|
return average_group_size / feature_data['range'] # number of values in category
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _calc_ncp_numeric(feature_range, range_count, feature_data, total):
|
||||||
|
# if there are no ranges, the feature is suppressed and iLoss is 1
|
||||||
|
if not feature_range:
|
||||||
|
return 1
|
||||||
|
# range only contains the split values, need to add min and max value of feature
|
||||||
|
# to enable computing sizes of all ranges
|
||||||
|
new_range = [feature_data['min']] + feature_range + [feature_data['max']]
|
||||||
|
range_sizes = [b - a for a, b in zip(new_range[::1], new_range[1::1])]
|
||||||
|
normalized_range_sizes = [s * n / total for s, n in zip(range_sizes, range_count)]
|
||||||
|
average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes)
|
||||||
|
return average_range_size / (feature_data['max'] - feature_data['min'])
|
||||||
|
|
||||||
def _get_record_indexes_for_cell(self, X, cell, mapped):
|
def _get_record_indexes_for_cell(self, X, cell, mapped):
|
||||||
indexes = []
|
indexes = []
|
||||||
for index, row in X.iterrows():
|
for index, row in X.iterrows():
|
||||||
|
|
@ -429,20 +541,21 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
indexes.append(index)
|
indexes.append(index)
|
||||||
return indexes
|
return indexes
|
||||||
|
|
||||||
def _cell_contains(self, cell, x, i, mapped):
|
def _cell_contains(self, cell, x, index, mapped):
|
||||||
for f in self._features:
|
for f in self._features:
|
||||||
|
i = self._features.index(f)
|
||||||
if f in cell['ranges']:
|
if f in cell['ranges']:
|
||||||
if not self._cell_contains_numeric(f, cell['ranges'][f], x):
|
if not self._cell_contains_numeric(i, cell['ranges'][f], x):
|
||||||
return False
|
return False
|
||||||
elif f in cell['categories']:
|
elif f in cell['categories']:
|
||||||
if not self._cell_contains_categorical(f, cell['categories'][f], x):
|
if not self._cell_contains_categorical(i, cell['categories'][f], x):
|
||||||
return False
|
return False
|
||||||
elif f in cell['untouched']:
|
elif f in cell['untouched']:
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
raise TypeError("feature " + f + "not found in cell" + cell['id'])
|
raise TypeError("feature " + f + "not found in cell" + cell['id'])
|
||||||
# Mark as mapped
|
# Mark as mapped
|
||||||
mapped.itemset(i, 1)
|
mapped.itemset(index, 1)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _encode_categorical_features(self, X, save_mapping=False):
|
def _encode_categorical_features(self, X, save_mapping=False):
|
||||||
|
|
@ -476,8 +589,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
self._encoded_features = new_data.columns
|
self._encoded_features = new_data.columns
|
||||||
return new_data
|
return new_data
|
||||||
|
|
||||||
def _cell_contains_numeric(self, f, range, x):
|
@staticmethod
|
||||||
i = self._features.index(f)
|
def _cell_contains_numeric(i, range, x):
|
||||||
# convert x to ndarray to allow indexing
|
# convert x to ndarray to allow indexing
|
||||||
a = np.array(x)
|
a = np.array(x)
|
||||||
value = a.item(i)
|
value = a.item(i)
|
||||||
|
|
@ -489,8 +602,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _cell_contains_categorical(self, f, range, x):
|
@staticmethod
|
||||||
i = self._features.index(f)
|
def _cell_contains_categorical(i, range, x):
|
||||||
# convert x to ndarray to allow indexing
|
# convert x to ndarray to allow indexing
|
||||||
a = np.array(x)
|
a = np.array(x)
|
||||||
value = a.item(i)
|
value = a.item(i)
|
||||||
|
|
@ -819,7 +932,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
self._remove_categorical_untouched(self._generalizations)
|
self._remove_categorical_untouched(self._generalizations)
|
||||||
|
|
||||||
def _find_range_count(self, samples, ranges):
|
def _find_range_count(self, samples, ranges):
|
||||||
samples_df = pd.DataFrame(samples, columns=self._encoded_features)
|
samples_df = pd.DataFrame(samples, columns=self._features)
|
||||||
range_counts = {}
|
range_counts = {}
|
||||||
last_value = None
|
last_value = None
|
||||||
for r in ranges.keys():
|
for r in ranges.keys():
|
||||||
|
|
@ -844,31 +957,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
category_counts[c].append(len(samples.loc[samples[c].isin(value)]))
|
category_counts[c].append(len(samples.loc[samples[c].isin(value)]))
|
||||||
return category_counts
|
return category_counts
|
||||||
|
|
||||||
def _calculate_ncp(self, samples, generalizations, feature_data):
|
|
||||||
# supressed features are already taken care of within _calc_ncp_numeric
|
|
||||||
ranges = generalizations['ranges']
|
|
||||||
categories = generalizations['categories']
|
|
||||||
range_counts = self._find_range_count(samples, ranges)
|
|
||||||
category_counts = self._find_categories_count(samples, categories)
|
|
||||||
|
|
||||||
total = samples.shape[0]
|
|
||||||
total_ncp = 0
|
|
||||||
total_features = len(generalizations['untouched'])
|
|
||||||
for feature in ranges.keys():
|
|
||||||
feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature],
|
|
||||||
feature_data[feature], total)
|
|
||||||
total_ncp = total_ncp + feature_ncp
|
|
||||||
total_features += 1
|
|
||||||
for feature in categories.keys():
|
|
||||||
featureNCP = self._calc_ncp_categorical(categories[feature], category_counts[feature],
|
|
||||||
feature_data[feature],
|
|
||||||
total)
|
|
||||||
total_ncp = total_ncp + featureNCP
|
|
||||||
total_features += 1
|
|
||||||
if total_features == 0:
|
|
||||||
return 0
|
|
||||||
return total_ncp / total_features
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _calculate_ranges(cells):
|
def _calculate_ranges(cells):
|
||||||
ranges = {}
|
ranges = {}
|
||||||
|
|
@ -942,26 +1030,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
untouched = untouched.intersection(*untouched_lists)
|
untouched = untouched.intersection(*untouched_lists)
|
||||||
return list(untouched)
|
return list(untouched)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _calc_ncp_categorical(categories, categoryCount, feature_data, total):
|
|
||||||
category_sizes = [len(g) if len(g) > 1 else 0 for g in categories]
|
|
||||||
normalized_category_sizes = [s * n / total for s, n in zip(category_sizes, categoryCount)]
|
|
||||||
average_group_size = sum(normalized_category_sizes) / len(normalized_category_sizes)
|
|
||||||
return average_group_size / feature_data['range'] # number of values in category
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _calc_ncp_numeric(feature_range, range_count, feature_data, total):
|
|
||||||
# if there are no ranges, feature is supressed and iLoss is 1
|
|
||||||
if not feature_range:
|
|
||||||
return 1
|
|
||||||
# range only contains the split values, need to add min and max value of feature
|
|
||||||
# to enable computing sizes of all ranges
|
|
||||||
new_range = [feature_data['min']] + feature_range + [feature_data['max']]
|
|
||||||
range_sizes = [b - a for a, b in zip(new_range[::1], new_range[1::1])]
|
|
||||||
normalized_range_sizes = [s * n / total for s, n in zip(range_sizes, range_count)]
|
|
||||||
average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes)
|
|
||||||
return average_range_size / (feature_data['max'] - feature_data['min'])
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _remove_feature_from_cells(cells, cells_by_id, feature):
|
def _remove_feature_from_cells(cells, cells_by_id, feature):
|
||||||
for cell in cells:
|
for cell in cells:
|
||||||
|
|
|
||||||
|
|
@ -54,6 +54,33 @@ def test_minimizer_params(data):
|
||||||
gen.transform(dataset=ArrayDataset(X, features_names=features))
|
gen.transform(dataset=ArrayDataset(X, features_names=features))
|
||||||
|
|
||||||
|
|
||||||
|
def test_minimizer_params_not_transform(data):
|
||||||
|
# Assume two features, age and height, and boolean label
|
||||||
|
cells = [{"id": 1, "ranges": {"age": {"start": None, "end": 38}, "height": {"start": None, "end": 170}}, "label": 0,
|
||||||
|
'categories': {}, "representative": {"age": 26, "height": 149}},
|
||||||
|
{"id": 2, "ranges": {"age": {"start": 39, "end": None}, "height": {"start": None, "end": 170}}, "label": 1,
|
||||||
|
'categories': {}, "representative": {"age": 58, "height": 163}},
|
||||||
|
{"id": 3, "ranges": {"age": {"start": None, "end": 38}, "height": {"start": 171, "end": None}}, "label": 0,
|
||||||
|
'categories': {}, "representative": {"age": 31, "height": 184}},
|
||||||
|
{"id": 4, "ranges": {"age": {"start": 39, "end": None}, "height": {"start": 171, "end": None}}, "label": 1,
|
||||||
|
'categories': {}, "representative": {"age": 45, "height": 176}}
|
||||||
|
]
|
||||||
|
features = ['age', 'height']
|
||||||
|
X = np.array([[23, 165],
|
||||||
|
[45, 158],
|
||||||
|
[18, 190]])
|
||||||
|
y = [1, 1, 0]
|
||||||
|
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
|
||||||
|
min_samples_leaf=1)
|
||||||
|
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
|
||||||
|
model.fit(ArrayDataset(X, y))
|
||||||
|
|
||||||
|
gen = GeneralizeToRepresentative(model, cells=cells)
|
||||||
|
gen.calculate_ncp(X)
|
||||||
|
ncp = gen.ncp
|
||||||
|
assert (ncp > 0.0)
|
||||||
|
|
||||||
|
|
||||||
def test_minimizer_fit(data):
|
def test_minimizer_fit(data):
|
||||||
features = ['age', 'height']
|
features = ['age', 'height']
|
||||||
X = np.array([[23, 165],
|
X = np.array([[23, 165],
|
||||||
|
|
@ -101,13 +128,62 @@ def test_minimizer_fit(data):
|
||||||
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
|
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
|
||||||
ncp = gen.ncp
|
ncp = gen.ncp
|
||||||
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
||||||
assert (ncp > 0)
|
assert (ncp > 0.0)
|
||||||
assert (((transformed[indexes]) != (X[indexes])).any())
|
assert (((transformed[indexes]) != (X[indexes])).any())
|
||||||
|
|
||||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||||
|
|
||||||
|
|
||||||
|
def test_minimizer_fit_not_transform(data):
|
||||||
|
features = ['age', 'height']
|
||||||
|
X = np.array([[23, 165],
|
||||||
|
[45, 158],
|
||||||
|
[56, 123],
|
||||||
|
[67, 154],
|
||||||
|
[45, 149],
|
||||||
|
[42, 166],
|
||||||
|
[73, 172],
|
||||||
|
[94, 168],
|
||||||
|
[69, 175],
|
||||||
|
[24, 181],
|
||||||
|
[18, 190]])
|
||||||
|
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
|
||||||
|
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
|
||||||
|
min_samples_leaf=1)
|
||||||
|
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
|
||||||
|
model.fit(ArrayDataset(X, y))
|
||||||
|
ad = ArrayDataset(X)
|
||||||
|
predictions = model.predict(ad)
|
||||||
|
if predictions.shape[1] > 1:
|
||||||
|
predictions = np.argmax(predictions, axis=1)
|
||||||
|
target_accuracy = 0.5
|
||||||
|
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
|
||||||
|
train_dataset = ArrayDataset(X, predictions, features_names=features)
|
||||||
|
|
||||||
|
gen.fit(dataset=train_dataset)
|
||||||
|
gener = gen.generalizations
|
||||||
|
expected_generalizations = {'ranges': {}, 'categories': {}, 'untouched': ['height', 'age']}
|
||||||
|
|
||||||
|
for key in expected_generalizations['ranges']:
|
||||||
|
assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
|
||||||
|
for key in expected_generalizations['categories']:
|
||||||
|
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]])
|
||||||
|
== set([frozenset(sl) for sl in gener['categories'][key]]))
|
||||||
|
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
|
||||||
|
modified_features = [f for f in features if
|
||||||
|
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
|
||||||
|
'ranges'].keys()]
|
||||||
|
indexes = []
|
||||||
|
for i in range(len(features)):
|
||||||
|
if features[i] in modified_features:
|
||||||
|
indexes.append(i)
|
||||||
|
|
||||||
|
ncp = gen.ncp
|
||||||
|
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
||||||
|
assert (ncp > 0.0)
|
||||||
|
|
||||||
|
|
||||||
def test_minimizer_fit_pandas(data):
|
def test_minimizer_fit_pandas(data):
|
||||||
features = ['age', 'height', 'sex', 'ola']
|
features = ['age', 'height', 'sex', 'ola']
|
||||||
X = [[23, 165, 'f', 'aa'],
|
X = [[23, 165, 'f', 'aa'],
|
||||||
|
|
@ -172,7 +248,7 @@ def test_minimizer_fit_pandas(data):
|
||||||
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
|
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
|
||||||
ncp = gen.ncp
|
ncp = gen.ncp
|
||||||
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
||||||
assert (ncp > 0)
|
assert (ncp > 0.0)
|
||||||
assert (((transformed[modified_features]).equals(X[modified_features])) is False)
|
assert (((transformed[modified_features]).equals(X[modified_features])) is False)
|
||||||
|
|
||||||
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
||||||
|
|
@ -294,7 +370,7 @@ def test_minimizer_fit_QI(data):
|
||||||
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
|
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
|
||||||
ncp = gen.ncp
|
ncp = gen.ncp
|
||||||
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
||||||
assert (ncp > 0)
|
assert (ncp > 0.0)
|
||||||
assert (((transformed[indexes]) != (X[indexes])).any())
|
assert (((transformed[indexes]) != (X[indexes])).any())
|
||||||
|
|
||||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||||
|
|
@ -370,7 +446,7 @@ def test_minimizer_fit_pandas_QI(data):
|
||||||
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
|
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
|
||||||
ncp = gen.ncp
|
ncp = gen.ncp
|
||||||
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
||||||
assert (ncp > 0)
|
assert (ncp > 0.0)
|
||||||
assert (((transformed[modified_features]).equals(X[modified_features])) is False)
|
assert (((transformed[modified_features]).equals(X[modified_features])) is False)
|
||||||
|
|
||||||
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
||||||
|
|
@ -414,7 +490,7 @@ def test_minimize_ndarray_iris():
|
||||||
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
|
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
|
||||||
ncp = gen.ncp
|
ncp = gen.ncp
|
||||||
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
||||||
assert (ncp > 0)
|
assert (ncp > 0.0)
|
||||||
assert (((transformed[indexes]) != (x_train[indexes])).any())
|
assert (((transformed[indexes]) != (x_train[indexes])).any())
|
||||||
|
|
||||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||||
|
|
@ -492,7 +568,7 @@ def test_minimize_pandas_adult():
|
||||||
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
|
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
|
||||||
ncp = gen.ncp
|
ncp = gen.ncp
|
||||||
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
||||||
assert (ncp > 0)
|
assert (ncp > 0.0)
|
||||||
assert (((transformed[modified_features]).equals(x_train[modified_features])) is False)
|
assert (((transformed[modified_features]).equals(x_train[modified_features])) is False)
|
||||||
|
|
||||||
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
||||||
|
|
@ -570,7 +646,7 @@ def test_german_credit_pandas():
|
||||||
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
|
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
|
||||||
ncp = gen.ncp
|
ncp = gen.ncp
|
||||||
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
||||||
assert (ncp > 0)
|
assert (ncp > 0.0)
|
||||||
assert (((transformed[modified_features]).equals(x_train[modified_features])) is False)
|
assert (((transformed[modified_features]).equals(x_train[modified_features])) is False)
|
||||||
|
|
||||||
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
||||||
|
|
@ -644,7 +720,7 @@ def test_regression():
|
||||||
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
|
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
|
||||||
ncp = gen.ncp
|
ncp = gen.ncp
|
||||||
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
||||||
assert (ncp > 0)
|
assert (ncp > 0.0)
|
||||||
assert (((transformed[indexes]) != (x_train[indexes])).any())
|
assert (((transformed[indexes]) != (x_train[indexes])).any())
|
||||||
|
|
||||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||||
|
|
@ -698,7 +774,7 @@ def test_X_y(data):
|
||||||
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
|
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
|
||||||
ncp = gen.ncp
|
ncp = gen.ncp
|
||||||
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
||||||
assert (ncp > 0)
|
assert (ncp > 0.0)
|
||||||
assert (((transformed[indexes]) != (X[indexes])).any())
|
assert (((transformed[indexes]) != (X[indexes])).any())
|
||||||
|
|
||||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||||
|
|
@ -752,7 +828,7 @@ def test_X_y_features_names(data):
|
||||||
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
|
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
|
||||||
ncp = gen.ncp
|
ncp = gen.ncp
|
||||||
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
||||||
assert (ncp > 0)
|
assert (ncp > 0.0)
|
||||||
assert (((transformed[indexes]) != (X[indexes])).any())
|
assert (((transformed[indexes]) != (X[indexes])).any())
|
||||||
|
|
||||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||||
|
|
@ -826,7 +902,7 @@ def test_BaseEstimator_classification(data):
|
||||||
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
|
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
|
||||||
ncp = gen.ncp
|
ncp = gen.ncp
|
||||||
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
||||||
assert (ncp > 0)
|
assert (ncp > 0.0)
|
||||||
assert (((transformed[modified_features]).equals(X[modified_features])) is False)
|
assert (((transformed[modified_features]).equals(X[modified_features])) is False)
|
||||||
|
|
||||||
rel_accuracy = model.score(preprocessor.transform(transformed), predictions)
|
rel_accuracy = model.score(preprocessor.transform(transformed), predictions)
|
||||||
|
|
@ -899,7 +975,7 @@ def test_BaseEstimator_regression():
|
||||||
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
|
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
|
||||||
ncp = gen.ncp
|
ncp = gen.ncp
|
||||||
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
|
||||||
assert (ncp > 0)
|
assert (ncp > 0.0)
|
||||||
assert (((transformed[indexes]) != (x_train[indexes])).any())
|
assert (((transformed[indexes]) != (x_train[indexes])).any())
|
||||||
|
|
||||||
rel_accuracy = model.score(transformed, predictions)
|
rel_accuracy = model.score(transformed, predictions)
|
||||||
|
|
@ -940,7 +1016,7 @@ def test_keras_model():
|
||||||
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_test, indexes, axis=1)).all())
|
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_test, indexes, axis=1)).all())
|
||||||
ncp = gen.ncp
|
ncp = gen.ncp
|
||||||
if len(gener['ranges'].keys()) > 0 or len(gener['categories'].keys()) > 0:
|
if len(gener['ranges'].keys()) > 0 or len(gener['categories'].keys()) > 0:
|
||||||
assert (ncp > 0)
|
assert (ncp > 0.0)
|
||||||
assert (((transformed[indexes]) != (X[indexes])).any())
|
assert (((transformed[indexes]) != (X[indexes])).any())
|
||||||
|
|
||||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue