diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py index 65513b8..702b799 100644 --- a/apt/minimization/minimizer.py +++ b/apt/minimization/minimizer.py @@ -41,6 +41,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM :param target_accuracy: The required relative accuracy when applying the base model to the generalized data. Accuracy is measured relative to the original accuracy of the model. :type target_accuracy: float, optional + :param generalize_using_transform: Indicates how to calculate NCP and accuracy during the generalization process. + True means that the `transform` method is used to transform original data into + generalized data that is used for accuracy and NCP calculation. False indicates + that the `generalizations` structure should be used. Default is True. + :type generalize_using_transform: boolean, optional :param cells: The cells used to generalize records. Each cell must define a range or subset of categories for each feature, as well as a representative value for each feature. This parameter should be used when instantiating a transformer object without first fitting it. 
@@ -61,8 +66,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM :type is_regression: boolean, optional """ - def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: Optional[float] = 0.998, - cells: Optional[list] = None, categorical_features: Optional[Union[np.ndarray, list]] = None, + def __init__(self, estimator: Union[BaseEstimator, Model] = None, + target_accuracy: Optional[float] = 0.998, + generalize_using_transform: Optional[bool] = True, + cells: Optional[list] = None, + categorical_features: Optional[Union[np.ndarray, list]] = None, encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None, features_to_minimize: Optional[Union[np.ndarray, list]] = None, train_only_features_to_minimize: Optional[bool] = True, @@ -76,6 +84,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_PROBABILITIES) self.target_accuracy = target_accuracy self.cells = cells + if cells: + self._calculate_generalizations() self.categorical_features = [] if categorical_features: self.categorical_features = categorical_features @@ -83,6 +93,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self.train_only_features_to_minimize = train_only_features_to_minimize self.is_regression = is_regression self.encoder = encoder + # NOTE(review): forced to False until the transform-based NCP computation in calculate_ncp is implemented; should then become: self.generalize_using_transform = generalize_using_transform + self.generalize_using_transform = False + self._ncp = 0.0 + self._feature_data = {} + self._categorical_values = {} + self._dt = None + self._features = None + self._level = 0 def get_params(self, deep=True): """ @@ -99,6 +117,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM ret['features_to_minimize'] = self.features_to_minimize ret['train_only_features_to_minimize'] = self.train_only_features_to_minimize ret['is_regression'] = self.is_regression +
ret['generalize_using_transform'] = self.generalize_using_transform if deep: ret['cells'] = copy.deepcopy(self.cells) ret['estimator'] = self.estimator @@ -132,6 +151,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self.is_regression = params['is_regression'] if 'cells' in params: self.cells = params['cells'] + if 'generalize_using_transform' in params: + self.generalize_using_transform = params['generalize_using_transform'] return self @property @@ -140,17 +161,18 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM Return the generalizations derived from the model and test data. :return: generalizations object. Contains 3 sections: 'ranges' that contains ranges for numerical features, - 'categories' that contains sub-groups of categories for categorical features, and - 'untouched' that contains the features that could not be generalized. + 'categories' that contains sub-groups of categories for categorical features, and + 'untouched' that contains the features that could not be generalized. """ return self._generalizations @property def ncp(self): """ - Return the NCP score of the generalizations. + Return the last calculated NCP score. NCP score is calculated upon calling `fit` (on the training data), + `transform` (on the test data) or when explicitly calling `calculate_ncp` and providing it a dataset. - :return: ncp score as float. + :return: NCP score as float.
""" return self._ncp @@ -251,9 +273,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM used_X_test = X_test_QI # collect feature data (such as min, max) - feature_data = {} + self._feature_data = {} for feature in self._features: - if feature not in feature_data.keys(): + if feature not in self._feature_data.keys(): fd = {} values = list(x.loc[:, feature]) if feature not in self.categorical_features: @@ -262,7 +284,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM fd['range'] = max(values) - min(values) else: fd['range'] = len(np.unique(values)) - feature_data[feature] = fd + self._feature_data[feature] = fd # default encoder in case none provided if self.encoder is None: @@ -316,17 +338,18 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM # if accuracy above threshold, improve generalization if accuracy > self.target_accuracy: print('Improving generalizations') - level = 1 + self._level = 1 while accuracy > self.target_accuracy: cells_previous_iter = self.cells generalization_prev_iter = self._generalizations cells_by_id_prev = self._cells_by_id - nodes = self._get_nodes_level(level) + nodes = self._get_nodes_level(self._level) try: - self._calculate_level_cells(level) + self._calculate_level_cells(self._level) except TypeError as e: print(e) + self._level -= 1 break self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes) @@ -340,10 +363,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self.cells = cells_previous_iter self._generalizations = generalization_prev_iter self._cells_by_id = cells_by_id_prev + self._level -= 1 break else: - print('Pruned tree to level: %d, new relative accuracy: %f' % (level, accuracy)) - level += 1 + print('Pruned tree to level: %d, new relative accuracy: %f' % (self._level, accuracy)) + self._level += 1 # if accuracy below threshold, improve accuracy by removing features 
from generalization elif accuracy < self.target_accuracy: @@ -351,7 +375,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM while accuracy < self.target_accuracy: removed_feature = self._remove_feature_from_generalization(X_test, x_prepared_test, nodes, y_test, - feature_data, accuracy) + self._feature_data, accuracy) if removed_feature is None: break @@ -363,7 +387,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM # self._cells currently holds the chosen generalization based on target accuracy # calculate iLoss - self._ncp = self._calculate_ncp(X_test, self._generalizations, feature_data) + self.calculate_ncp(X_test) # Return the transformer return self @@ -383,7 +407,66 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM :return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features) """ + if not self.generalize_using_transform: + raise ValueError('transform method called even though generalize_using_transform parameter was False. This ' + 'can lead to inconsistent results.') + transformed = self._inner_transform(X, features_names, dataset) + self.calculate_ncp(transformed, True) + return transformed + def calculate_ncp(self, samples: Optional[DATA_PANDAS_NUMPY_TYPE] = None, transformed: Optional[bool] = False): + """ + Compute the NCP score of the generalization. Calculation is based on the value of the + generalize_using_transform param. + Based on the NCP score presented in: Ghinita, G., Karras, P., Kalnis, P., Mamoulis, N.: Fast data anonymization + with low information loss (https://www.vldb.org/conf/2007/papers/research/p758-ghinita.pdf) + + :param samples: The input samples to compute the NCP score on. Ideally should be the data that will be + transformed (e.g., test/runtime data).
If no samples are supplied, will return the last NCP score + computed by the `fit` or `transform` method. + :type samples: {array-like, sparse matrix}, shape (n_samples, n_features), optional + :param transformed: Whether the supplied samples have already been transformed using the `transform` method. + Default is False. + :type transformed: boolean, optional + :return: NCP score as float. + """ + if samples is None: + return self._ncp + elif self.generalize_using_transform: + if not transformed: + # transform data + transformed_data = self._inner_transform(samples) + else: + transformed_data = samples + # TODO: compute the NCP score from transformed_data and update self._ncp; transformed_data is currently unused and this branch returns None + else: # use generalizations + # suppressed features are already taken care of within _calc_ncp_numeric + ranges = self.generalizations['ranges'] + categories = self.generalizations['categories'] + range_counts = self._find_range_count(samples, ranges) + category_counts = self._find_categories_count(samples, categories) + + total = samples.shape[0] + total_ncp = 0 + total_features = len(self.generalizations['untouched']) + for feature in ranges.keys(): + feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature], + self._feature_data[feature], total) + total_ncp = total_ncp + feature_ncp + total_features += 1 + for feature in categories.keys(): + feature_ncp = self._calc_ncp_categorical(categories[feature], category_counts[feature], + self._feature_data[feature], + total) + total_ncp = total_ncp + feature_ncp + total_features += 1 + if total_features == 0: + return 0 + self._ncp = total_ncp / total_features + return self._ncp + + def _inner_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None, + dataset: Optional[ArrayDataset] = None): # Check if fit has been called msg = 'This %(name)s instance is not initialized yet.
' \ 'Call ‘fit’ or ‘set_params’ with ' \ @@ -409,12 +492,21 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM if not self._features: self._features = [i for i in range(x.shape[1])] - mapped = np.zeros(x.shape[0]) # to mark records we already mapped - all_indexes = [] - for i in range(len(self.cells)): - indexes = self._get_record_indexes_for_cell(x, self.cells[i], mapped) - all_indexes.append(indexes) - generalized = self._generalize_indexes(x, self.cells, all_indexes) + if self._dt: # only works if fit was called previously (but much more efficient) + nodes = self._get_nodes_level(self._level) + QI = x.loc[:, self.features_to_minimize] + used_x = x + if self.train_only_features_to_minimize: + used_x = QI + prepared = self._encode_categorical_features(used_x) + generalized = self._generalize(x, prepared, nodes, self.cells, self._cells_by_id) + else: + mapped = np.zeros(x.shape[0]) # to mark records we already mapped + all_indexes = [] + for i in range(len(self.cells)): + indexes = self._get_record_indexes_for_cell(x, self.cells[i], mapped) + all_indexes.append(indexes) + generalized = self._generalize_indexes(x, self.cells, all_indexes) if dataset and dataset.is_pandas: return generalized @@ -422,6 +514,26 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM return generalized return generalized.to_numpy() + @staticmethod + def _calc_ncp_categorical(categories, category_count, feature_data, total): + category_sizes = [len(g) if len(g) > 1 else 0 for g in categories] + normalized_category_sizes = [s * n / total for s, n in zip(category_sizes, category_count)] + average_group_size = sum(normalized_category_sizes) / len(normalized_category_sizes) + return average_group_size / feature_data['range'] # number of values in category + + @staticmethod + def _calc_ncp_numeric(feature_range, range_count, feature_data, total): + # if there are no ranges, feature is supressed and iLoss is 1 + if not feature_range: 
+ return 1 + # range only contains the split values, need to add min and max value of feature + # to enable computing sizes of all ranges + new_range = [feature_data['min']] + feature_range + [feature_data['max']] + range_sizes = [b - a for a, b in zip(new_range[::1], new_range[1::1])] + normalized_range_sizes = [s * n / total for s, n in zip(range_sizes, range_count)] + average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes) + return average_range_size / (feature_data['max'] - feature_data['min']) + def _get_record_indexes_for_cell(self, X, cell, mapped): indexes = [] for index, row in X.iterrows(): @@ -429,20 +541,21 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM indexes.append(index) return indexes - def _cell_contains(self, cell, x, i, mapped): + def _cell_contains(self, cell, x, index, mapped): for f in self._features: + i = self._features.index(f) if f in cell['ranges']: - if not self._cell_contains_numeric(f, cell['ranges'][f], x): + if not self._cell_contains_numeric(i, cell['ranges'][f], x): return False elif f in cell['categories']: - if not self._cell_contains_categorical(f, cell['categories'][f], x): + if not self._cell_contains_categorical(i, cell['categories'][f], x): return False elif f in cell['untouched']: continue else: raise TypeError("feature " + f + "not found in cell" + cell['id']) # Mark as mapped - mapped.itemset(i, 1) + mapped.itemset(index, 1) return True def _encode_categorical_features(self, X, save_mapping=False): @@ -476,8 +589,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self._encoded_features = new_data.columns return new_data - def _cell_contains_numeric(self, f, range, x): - i = self._features.index(f) + @staticmethod + def _cell_contains_numeric(i, range, x): # convert x to ndarray to allow indexing a = np.array(x) value = a.item(i) @@ -489,8 +602,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM 
return False return True - def _cell_contains_categorical(self, f, range, x): - i = self._features.index(f) + @staticmethod + def _cell_contains_categorical(i, range, x): # convert x to ndarray to allow indexing a = np.array(x) value = a.item(i) @@ -819,7 +932,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self._remove_categorical_untouched(self._generalizations) def _find_range_count(self, samples, ranges): - samples_df = pd.DataFrame(samples, columns=self._encoded_features) + samples_df = pd.DataFrame(samples, columns=self._features) range_counts = {} last_value = None for r in ranges.keys(): @@ -844,31 +957,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM category_counts[c].append(len(samples.loc[samples[c].isin(value)])) return category_counts - def _calculate_ncp(self, samples, generalizations, feature_data): - # supressed features are already taken care of within _calc_ncp_numeric - ranges = generalizations['ranges'] - categories = generalizations['categories'] - range_counts = self._find_range_count(samples, ranges) - category_counts = self._find_categories_count(samples, categories) - - total = samples.shape[0] - total_ncp = 0 - total_features = len(generalizations['untouched']) - for feature in ranges.keys(): - feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature], - feature_data[feature], total) - total_ncp = total_ncp + feature_ncp - total_features += 1 - for feature in categories.keys(): - featureNCP = self._calc_ncp_categorical(categories[feature], category_counts[feature], - feature_data[feature], - total) - total_ncp = total_ncp + featureNCP - total_features += 1 - if total_features == 0: - return 0 - return total_ncp / total_features - @staticmethod def _calculate_ranges(cells): ranges = {} @@ -942,26 +1030,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM untouched = untouched.intersection(*untouched_lists) return 
list(untouched) - @staticmethod - def _calc_ncp_categorical(categories, categoryCount, feature_data, total): - category_sizes = [len(g) if len(g) > 1 else 0 for g in categories] - normalized_category_sizes = [s * n / total for s, n in zip(category_sizes, categoryCount)] - average_group_size = sum(normalized_category_sizes) / len(normalized_category_sizes) - return average_group_size / feature_data['range'] # number of values in category - - @staticmethod - def _calc_ncp_numeric(feature_range, range_count, feature_data, total): - # if there are no ranges, feature is supressed and iLoss is 1 - if not feature_range: - return 1 - # range only contains the split values, need to add min and max value of feature - # to enable computing sizes of all ranges - new_range = [feature_data['min']] + feature_range + [feature_data['max']] - range_sizes = [b - a for a, b in zip(new_range[::1], new_range[1::1])] - normalized_range_sizes = [s * n / total for s, n in zip(range_sizes, range_count)] - average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes) - return average_range_size / (feature_data['max'] - feature_data['min']) - @staticmethod def _remove_feature_from_cells(cells, cells_by_id, feature): for cell in cells: diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py index bd2f422..6cc5197 100644 --- a/tests/test_minimizer.py +++ b/tests/test_minimizer.py @@ -54,6 +54,33 @@ def test_minimizer_params(data): gen.transform(dataset=ArrayDataset(X, features_names=features)) +def test_minimizer_params_not_transform(data): + # Assume two features, age and height, and boolean label + cells = [{"id": 1, "ranges": {"age": {"start": None, "end": 38}, "height": {"start": None, "end": 170}}, "label": 0, + 'categories': {}, "representative": {"age": 26, "height": 149}}, + {"id": 2, "ranges": {"age": {"start": 39, "end": None}, "height": {"start": None, "end": 170}}, "label": 1, + 'categories': {}, "representative": {"age": 58, "height": 163}}, + {"id": 3, 
"ranges": {"age": {"start": None, "end": 38}, "height": {"start": 171, "end": None}}, "label": 0, + 'categories': {}, "representative": {"age": 31, "height": 184}}, + {"id": 4, "ranges": {"age": {"start": 39, "end": None}, "height": {"start": 171, "end": None}}, "label": 1, + 'categories': {}, "representative": {"age": 45, "height": 176}} + ] + features = ['age', 'height'] + X = np.array([[23, 165], + [45, 158], + [18, 190]]) + y = [1, 1, 0] + base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, + min_samples_leaf=1) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES) + model.fit(ArrayDataset(X, y)) + + gen = GeneralizeToRepresentative(model, cells=cells) + gen.calculate_ncp(X) + ncp = gen.ncp + assert (ncp > 0.0) + + def test_minimizer_fit(data): features = ['age', 'height'] X = np.array([[23, 165], @@ -101,13 +128,62 @@ def test_minimizer_fit(data): assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all()) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[indexes]) != (X[indexes])).any()) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) +def test_minimizer_fit_not_transform(data): + features = ['age', 'height'] + X = np.array([[23, 165], + [45, 158], + [56, 123], + [67, 154], + [45, 149], + [42, 166], + [73, 172], + [94, 168], + [69, 175], + [24, 181], + [18, 190]]) + y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]) + base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, + min_samples_leaf=1) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES) + model.fit(ArrayDataset(X, y)) + ad = ArrayDataset(X) + predictions = model.predict(ad) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, 
axis=1) + target_accuracy = 0.5 + gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy) + train_dataset = ArrayDataset(X, predictions, features_names=features) + + gen.fit(dataset=train_dataset) + gener = gen.generalizations + expected_generalizations = {'ranges': {}, 'categories': {}, 'untouched': ['height', 'age']} + + for key in expected_generalizations['ranges']: + assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key])) + for key in expected_generalizations['categories']: + assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) + == set([frozenset(sl) for sl in gener['categories'][key]])) + assert (set(expected_generalizations['untouched']) == set(gener['untouched'])) + modified_features = [f for f in features if + f in expected_generalizations['categories'].keys() or f in expected_generalizations[ + 'ranges'].keys()] + indexes = [] + for i in range(len(features)): + if features[i] in modified_features: + indexes.append(i) + + ncp = gen.ncp + if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: + assert (ncp > 0.0) + + def test_minimizer_fit_pandas(data): features = ['age', 'height', 'sex', 'ola'] X = [[23, 165, 'f', 'aa'], @@ -172,7 +248,7 @@ def test_minimizer_fit_pandas(data): np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1)) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[modified_features]).equals(X[modified_features])) is False) rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions)) @@ -294,7 +370,7 @@ def test_minimizer_fit_QI(data): assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all()) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 
or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[indexes]) != (X[indexes])).any()) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) @@ -370,7 +446,7 @@ def test_minimizer_fit_pandas_QI(data): np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1)) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[modified_features]).equals(X[modified_features])) is False) rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions)) @@ -414,7 +490,7 @@ def test_minimize_ndarray_iris(): assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all()) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[indexes]) != (x_train[indexes])).any()) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) @@ -492,7 +568,7 @@ def test_minimize_pandas_adult(): np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1)) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[modified_features]).equals(x_train[modified_features])) is False) rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions)) @@ -570,7 +646,7 @@ def test_german_credit_pandas(): np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1)) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert 
(ncp > 0) + assert (ncp > 0.0) assert (((transformed[modified_features]).equals(x_train[modified_features])) is False) rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions)) @@ -644,7 +720,7 @@ def test_regression(): assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all()) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[indexes]) != (x_train[indexes])).any()) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) @@ -698,7 +774,7 @@ def test_X_y(data): assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all()) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[indexes]) != (X[indexes])).any()) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) @@ -752,7 +828,7 @@ def test_X_y_features_names(data): assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all()) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[indexes]) != (X[indexes])).any()) rel_accuracy = model.score(ArrayDataset(transformed, predictions)) @@ -826,7 +902,7 @@ def test_BaseEstimator_classification(data): np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1)) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[modified_features]).equals(X[modified_features])) is False) rel_accuracy = model.score(preprocessor.transform(transformed), 
predictions) @@ -899,7 +975,7 @@ def test_BaseEstimator_regression(): assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all()) ncp = gen.ncp if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[indexes]) != (x_train[indexes])).any()) rel_accuracy = model.score(transformed, predictions) @@ -940,7 +1016,7 @@ def test_keras_model(): assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_test, indexes, axis=1)).all()) ncp = gen.ncp if len(gener['ranges'].keys()) > 0 or len(gener['categories'].keys()) > 0: - assert (ncp > 0) + assert (ncp > 0.0) assert (((transformed[indexes]) != (X[indexes])).any()) rel_accuracy = model.score(ArrayDataset(transformed, predictions))