diff --git a/apt/minimization/README.md b/apt/minimization/README.md
index 831f5d0..0f19ede 100644
--- a/apt/minimization/README.md
+++ b/apt/minimization/README.md
@@ -32,8 +32,7 @@
 them to new data. It is also possible to export the generalizations as feature ranges.
 
-The current implementation supports only numeric features, so any categorical features must be transformed to a numeric
-representation before using this class.
+The current implementation supports both numeric and categorical features.
 
 Start by training your machine learning model. In this example, we will use a ``DecisionTreeClassifier``, but any
 scikit-learn model can be used. We will use the iris dataset in our example.
diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py
index 5ab5c76..3d7af45 100644
--- a/apt/minimization/minimizer.py
+++ b/apt/minimization/minimizer.py
@@ -1,18 +1,22 @@
 """
 This module implements all classes needed to perform data minimization
 """
-
+from typing import Union
 import pandas as pd
 import numpy as np
 import copy
 import sys
 from scipy.spatial import distance
 from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin
-from sklearn.base import clone
+from sklearn.compose import ColumnTransformer
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.model_selection import train_test_split
+
 
 class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
     """
     A transformer that generalizes data to representative points.
@@ -44,6 +48,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
     features : list of str, optional
         The feature names, in the order that they appear in the data.
 
+    categorical_features : list of str, optional
+        The names of the categorical features. Should only be supplied when
+        passing data as a pandas DataFrame.
+
     cells : list of object, optional
         The cells used to generalize records. Each cell must define a
         range or subset of categories for each feature, as well as a
@@ -70,11 +78,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
     """
 
     def __init__(self, estimator=None, target_accuracy=0.998, features=None,
-                 cells=None):
+                 cells=None, categorical_features=None):
        self.estimator = estimator
        self.target_accuracy = target_accuracy
        self.features = features
        self.cells = cells
+        self.categorical_features = []
+        if categorical_features:
+            self.categorical_features = categorical_features
+        self.is_numpy = True
 
     def get_params(self, deep=True):
         """Get parameters for this estimator.
@@ -121,7 +133,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
     def generalizations(self):
         return self.generalizations_
 
-    def fit_transform(self, X=None, y=None):
+    def fit_transform(self, X: Union[np.ndarray, pd.DataFrame] = None, y: Union[np.ndarray, pd.DataFrame] = None):
         """Learns the generalizations based on training data, and applies them to the data.
 
         Parameters
         ----------
         X : {array-like, sparse matrix}, shape (n_samples, n_features)
             The training input samples.
 
         y : array-like, shape (n_samples,)
             The target values.
 
         Returns
         -------
-        self : object
-            Returns self.
+        X_transformed : ndarray or pandas DataFrame (matching the input type), shape (n_samples, n_features)
+            The array containing the representative values to which each record in
+            ``X`` is mapped.
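+
+        Notes
+        -----
+        Equivalent to calling ``fit(X, y)`` followed by ``transform(X)``.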
""" self.fit(X, y) return self.transform(X) - def fit(self, X=None, y=None): + def fit(self, X: Union[np.ndarray, pd.DataFrame] = None, y: Union[np.ndarray, pd.DataFrame] = None): """Learns the generalizations based on training data. Parameters @@ -153,15 +166,21 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM Returns ------- - X_transformed : ndarray, shape (n_samples, n_features) + X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features) The array containing the representative values to which each record in ``X`` is mapped. """ # take into account that estimator, X, y, cells, features may be None + if X is not None: + if type(X) == np.ndarray: + self.is_numpy = True + else: + self.is_numpy = False if X is not None and y is not None: - X, y = check_X_y(X, y, accept_sparse=True) + if self.is_numpy: + X, y = check_X_y(X, y, accept_sparse=True) self.n_features_ = X.shape[1] elif self.features: self.n_features_ = len(self.features) @@ -180,6 +199,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self.cells_ = self.cells else: self.cells_ = {} + self.categorical_values = {} + + if self.is_numpy: + X = pd.DataFrame(X, columns=self._features) # Going to fit @@ -187,36 +210,67 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM if self.estimator and X is not None and y is not None: # divide dataset into train and test X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, - test_size = 0.4, - random_state = 18) + test_size=0.4, + random_state=18) # collect feature data (such as min, max) - train_data = pd.DataFrame(X_train, columns=self._features) + feature_data = {} for feature in self._features: - if not feature in feature_data.keys(): - values = list(train_data.loc[:, feature]) + if feature not in feature_data.keys(): fd = {} - fd['min'] = min(values) - fd['max'] = max(values) + values = list(X.loc[:, feature]) + if feature not in self.categorical_features: + fd['min'] = min(values) + fd['max'] = max(values) + fd['range'] = max(values) - min(values) + else: + fd['range'] = len(values) feature_data[feature] = fd + # prepare data for DT + categorical_features = list(self.categorical_features) + numeric_transformer = Pipeline( + steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))] + ) + + # numeric_features = list(self._features) - list(self.categorical_features) + numeric_features = [item for item in self._features if item not in self.categorical_features] + categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False) + + preprocessor = ColumnTransformer( + transformers=[ + ("num", numeric_transformer, numeric_features), + ("cat", categorical_transformer, categorical_features), + ] + ) + preprocessor.fit(X) + + x_prepared = preprocessor.transform(X_train) + self.preprocessor = preprocessor self.cells_ = {} self.dt_ = DecisionTreeClassifier(random_state=0, min_samples_split=2, min_samples_leaf=1) - self.dt_.fit(X_train, y_train) + self.dt_.fit(x_prepared, y_train) + self._modify_categorical_features(X) + + x_prepared = pd.DataFrame(x_prepared, columns=self.categorical_data.columns) + self._calculate_cells() self._modify_cells() nodes = self._get_nodes_level(0) - self._attach_cells_representatives(X_train, y_train, nodes) + self._attach_cells_representatives(x_prepared, X_train, y_train, nodes) # self.cells_ currently holds the generalization created from the tree leaves self._calculate_generalizations() # apply 
+            self._modify_categorical_features(X)
+
+            x_prepared = pd.DataFrame(x_prepared, columns=self.categorical_data.columns)
+
             self._calculate_cells()
             self._modify_cells()
             nodes = self._get_nodes_level(0)
-            self._attach_cells_representatives(X_train, y_train, nodes)
+            self._attach_cells_representatives(x_prepared, X_train, y_train, nodes)
 
             # self.cells_ currently holds the generalization created from the tree leaves
             self._calculate_generalizations()
 
             # apply generalizations to test data
-            generalized = self._generalize(X_test, nodes, self.cells_, self.cells_by_id_)
+            x_prepared_test = preprocessor.transform(X_test)
+            x_prepared_test = pd.DataFrame(x_prepared_test, index=X_test.index, columns=self.categorical_data.columns)
+
+            generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
 
             # check accuracy
-            accuracy = self.estimator.score(generalized, y_test)
+            accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
             print('Initial accuracy of model on generalized data, relative to original model predictions '
                   '(base generalization derived from tree, before improvements): %f' % accuracy)
 
@@ -225,29 +279,44 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
             print('Improving generalizations')
             level = 1
             while accuracy > self.target_accuracy:
-                nodes = self._get_nodes_level(level)
-                self._calculate_level_cells(level)
-                self._attach_cells_representatives(X_train, y_train, nodes)
-                self._calculate_generalizations()
-                generalized = self._generalize(X_test, nodes, self.cells_,
-                                               self.cells_by_id_)
-                accuracy = self.estimator.score(generalized, y_test)
-                print('Pruned tree to level: %d, new relative accuracy: %f' % (level, accuracy))
-                level+=1
+                try:
+                    cells_previous_iter = self.cells_
+                    generalization_prev_iter = self.generalizations_
+                    cells_by_id_prev = self.cells_by_id_
+                    nodes = self._get_nodes_level(level)
+                    self._calculate_level_cells(level)
+
+                    self._attach_cells_representatives(x_prepared, X_train, y_train, nodes)
+                    self._calculate_generalizations()
+                    generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_,
+                                                   self.cells_by_id_)
+                    accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
+                    # if accuracy dropped below the threshold, roll back to the previous iteration's generalizations
+                    if accuracy < self.target_accuracy:
+                        self.cells_ = cells_previous_iter
+                        self.generalizations_ = generalization_prev_iter
+                        self.cells_by_id_ = cells_by_id_prev
+                        break
+                    else:
+                        print('Pruned tree to level: %d, new relative accuracy: %f' % (level, accuracy))
+                        level += 1
+                except Exception as e:
+                    print(e)
+                    break
 
         # if accuracy below threshold, improve accuracy by removing features from generalization
-        if accuracy < self.target_accuracy:
+        elif accuracy < self.target_accuracy:
             print('Improving accuracy')
             while accuracy < self.target_accuracy:
-                removed_feature = self._remove_feature_from_generalization(X_test,
+                removed_feature = self._remove_feature_from_generalization(X_test, x_prepared_test,
                                                                            nodes, y_test, feature_data, accuracy)
                 if removed_feature is None:
                     break
 
                 self._calculate_generalizations()
-                generalized = self._generalize(X_test, nodes, self.cells_, self.cells_by_id_)
-                accuracy = self.estimator.score(generalized, y_test)
+                generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
+                accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
                 print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
 
         # self.cells_ currently holds the chosen generalization based on target accuracy
@@ -258,17 +327,18 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
         # Return the transformer
         return self
 
-    def transform(self, X):
+    def transform(self, X: Union[np.ndarray, pd.DataFrame]):
         """ Transforms data records to representative points.
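+
+        Each record is mapped to the cell (decision-tree leaf) it falls into, and
+        the cell's representative values replace the record's own values for every
+        feature that is part of the learned generalization.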
 
         Parameters
         ----------
-        X : {array-like, sparse-matrix}, shape (n_samples, n_features)
-            The input samples.
+        X : {array-like, sparse-matrix}, shape (n_samples, n_features)
+            The input samples. If provided as a pandas DataFrame, it may
+            contain both numeric and categorical data.
 
         Returns
         -------
-        X_transformed : ndarray, shape (n_samples, n_features)
+        X_transformed : ndarray or pandas DataFrame (matching the input type), shape (n_samples, n_features)
             The array containing the representative values to which each record in
             ``X`` is mapped.
         """
@@ -279,8 +349,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
               'appropriate arguments before using this method.'
         check_is_fitted(self, ['cells', 'features'], msg=msg)
 
-        # Input validation
-        X = check_array(X, accept_sparse=True)
+        if isinstance(X, np.ndarray):
+            # Input validation
+            X = check_array(X, accept_sparse=True)
+            self.is_numpy = True
+            X = pd.DataFrame(X, columns=self._features)
+        else:
+            self.is_numpy = False
+
         if X.shape[1] != self.n_features_ and self.n_features_ != 0:
-            raise ValueError('Shape of input is different from what was seen'
-                             'in `fit`')
+            raise ValueError('Shape of input is different from what was seen '
+                             'in `fit`')
@@ -300,8 +377,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                 # if feature has a representative value in the cell and should not
                 # be left untouched, take the representative value
                 if feature in self.cells_[i]['representative'] and \
-                        ( 'untouched' not in self.cells_[i] \
-                        or feature not in self.cells_[i]['untouched'] ):
+                        ('untouched' not in self.cells_[i]
+                         or feature not in self.cells_[i]['untouched']):
                     representatives.loc[i, feature] = self.cells_[i]['representative'][feature]
                 # else, drop the feature (removes from representatives columns that
                 # do not have a representative value or should remain untouched)
@@ -315,30 +392,57 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
             # values (leaves others untouched)
             if indexes and not representatives.columns.empty:
                 if len(indexes) > 1:
-                    replace = pd.concat([representatives.loc[i].to_frame().T]*len(indexes)).reset_index(drop=True)
+                    replace = pd.concat([representatives.loc[i].to_frame().T] * len(indexes)).reset_index(drop=True)
                 else:
                     replace = representatives.loc[i].to_frame().T.reset_index(drop=True)
                 replace.index = indexes
                 generalized.loc[indexes, representatives.columns] = replace
-
-        return generalized.to_numpy()
+        if self.is_numpy:
+            return generalized.to_numpy()
+        return generalized
 
     def _get_record_indexes_for_cell(self, X, cell, mapped):
-        return [i for i, x in enumerate(X) if not mapped.item(i) and
-                self._cell_contains(cell, x, i, mapped)]
+        indexes = []
+        for index, row in X.iterrows():
+            if not mapped.item(index) and self._cell_contains(cell, row, index, mapped):
+                indexes.append(index)
+        return indexes
 
     def _cell_contains(self, cell, x, i, mapped):
         for f in self._features:
             if f in cell['ranges']:
                 if not self._cell_contains_numeric(f, cell['ranges'][f], x):
                     return False
+            elif f in cell['categories']:
+                if not self._cell_contains_categorical(f, cell['categories'][f], x):
+                    return False
+            elif f in cell.get('untouched', []):
+                continue
             else:
-                #TODO: exception - feature not defined
-                pass
+                raise ValueError('feature %s not found in cell %s' % (f, cell['id']))
 
         # Mark as mapped
         mapped.itemset(i, 1)
         return True
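+
+    # categorical features are expanded into one-hot columns via pd.get_dummies
+    # (named "<feature>_<value>"); oneHotVectorFeaturesToFeatures maps each dummy
+    # column back to its source feature so that tree splits can be translated
+    # into subsets of category values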
+    def _modify_categorical_features(self, X):
+        self.categorical_values = {}
+        self.oneHotVectorFeaturesToFeatures = {}
+        features_to_remove = []
+        for feature in self.categorical_features:
+            try:
+                all_values = X.loc[:, feature]
+                values = list(all_values.unique())
+                self.categorical_values[feature] = values
+                X[feature] = pd.Categorical(X.loc[:, feature], categories=values, ordered=False)
+                ohe = pd.get_dummies(X[feature], prefix=feature)
+                for oneHotVectorFeature in ohe.columns:
+                    self.oneHotVectorFeaturesToFeatures[oneHotVectorFeature] = feature
+                X = pd.concat([X, ohe], axis=1)
+                features_to_remove.append(feature)
+            except KeyError:
+                print("feature " + feature + " not found in training data")
+        self.categorical_data = X.drop(features_to_remove, axis=1)
 
     def _cell_contains_numeric(self, f, range, x):
         i = self._features.index(f)
         # convert x to ndarray to allow indexing
@@ -352,6 +456,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
             return False
         return True
 
+    def _cell_contains_categorical(self, f, range, x):
+        i = self._features.index(f)
+        # convert x to ndarray to allow indexing
+        a = np.array(x)
+        value = a.item(i)
+        if value in range:
+            return True
+        return False
+
     def _calculate_cells(self):
         self.cells_by_id_ = {}
         self.cells_ = self._calculate_cells_recursive(0)
@@ -366,7 +479,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
             return [cell]
 
         cells = []
-        feature = self._features[feature_index]
+        feature = self.categorical_data.columns[feature_index]
         threshold = self.dt_.tree_.threshold[node]
         left_child = self.dt_.tree_.children_left[node]
        right_child = self.dt_.tree_.children_right[node]
@@ -397,22 +510,38 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
     def _modify_cells(self):
         cells = []
+        features = self.categorical_data.columns
         for cell in self.cells_:
-            new_cell = {'id': cell['id'], 'label': cell['label'], 'ranges': {},
-                        'categories': {}, 'hist': cell['hist'], 'representative': None}
-            for feature in self._features:
-                if feature in cell['ranges'].keys():
-                    new_cell['ranges'][feature] = cell['ranges'][feature]
+            new_cell = {'id': cell['id'], 'label': cell['label'], 'ranges': {}, 'categories': {}, 'hist': cell['hist'],
+                        'representative': None}
+            for feature in features:
+                if feature in self.oneHotVectorFeaturesToFeatures.keys():
+                    # feature is categorical and should be mapped back
+                    categorical_feature = self.oneHotVectorFeaturesToFeatures[feature]
+                    if categorical_feature not in new_cell['categories'].keys():
+                        new_cell['categories'][categorical_feature] = self.categorical_values[
+                            categorical_feature].copy()
+                    if feature in cell['ranges'].keys():
+                        categorical_value = feature[len(categorical_feature) + 1:]
+                        if cell['ranges'][feature]['start'] is not None:
+                            # the categorical feature must have this value
+                            new_cell['categories'][categorical_feature] = [categorical_value]
+                        else:
+                            # the categorical feature cannot have this value
+                            if categorical_value in new_cell['categories'][categorical_feature]:
+                                new_cell['categories'][categorical_feature].remove(categorical_value)
                 else:
-                    new_cell['ranges'][feature] = {'start': None, 'end': None}
+                    if feature in cell['ranges'].keys():
+                        new_cell['ranges'][feature] = cell['ranges'][feature]
+                    else:
+                        new_cell['ranges'][feature] = {'start': None, 'end': None}
             cells.append(new_cell)
             self.cells_by_id_[new_cell['id']] = new_cell
         self.cells_ = cells
 
     def _calculate_level_cells(self, level):
         if level < 0 or level > self.dt_.get_depth():
-            #TODO: exception 'Illegal level %d' % level
-            pass
+            raise ValueError('Illegal level %d' % level)
 
         if level > 0:
             new_cells = []
@@ -420,7 +549,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
             nodes = self._get_nodes_level(level)
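+            # a non-leaf node at the requested level absorbs the cells of its
+            # entire subtree, producing one coarser cell per remaining node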
             if nodes:
                 for node in nodes:
-                    if self.dt_.tree_.feature[node] == -2: # leaf node
+                    if self.dt_.tree_.feature[node] == -2:  # leaf node
                         new_cell = self.cells_by_id_[node]
                     else:
                         left_child = self.dt_.tree_.children_left[node]
                         right_child = self.dt_.tree_.children_right[node]
@@ -474,23 +603,26 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
         # return all nodes with depth == level or leaves higher than level
         return [i for i, x in enumerate(node_depth) if x == depth or (x < depth and is_leaves[i])]
 
-    def _attach_cells_representatives(self, samples, labels, level_nodes):
-        samples_df = pd.DataFrame(samples, columns=self._features)
-        labels_df = pd.DataFrame(labels, columns=['label'])
-        samples_node_ids = self._find_sample_nodes(samples_df, level_nodes)
+    def _attach_cells_representatives(self, prepared_data, original_train_features, label_feature, level_nodes):
+        # prepared_data includes the one-hot encoded categorical data;
+        # if there is no categorical data, prepared_data is the original data
+        node_ids = self._find_sample_nodes(prepared_data, level_nodes)
+        labels_df = pd.DataFrame(label_feature, columns=['label'])
         for cell in self.cells_:
             cell['representative'] = {}
             # get all rows in cell
-            indexes = [i for i, x in enumerate(samples_node_ids) if x == cell['id']]
-            sample_rows = samples_df.iloc[indexes]
+            indexes = [i for i, x in enumerate(node_ids) if x == cell['id']]
+            original_rows = original_train_features.iloc[indexes]
+            sample_rows = prepared_data.iloc[indexes]
             sample_labels = labels_df.iloc[indexes]['label'].values.tolist()
             # get rows with matching label
             indexes = [i for i, label in enumerate(sample_labels) if label == cell['label']]
             match_samples = sample_rows.iloc[indexes]
+            match_rows = original_rows.iloc[indexes]
             # find the "middle" of the cluster
             array = match_samples.values
+            # note: np.median with an axis argument requires numpy >= 1.9.0
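+            # the representative is the real record whose encoded values lie
+            # closest (in Euclidean distance) to the cell's per-column median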
             median = np.median(array, axis=0)
-            # find the record closest to the median
             i = 0
-            min = len(array)
+            min_index = 0
             min_dist = float("inf")
@@ -500,19 +632,22 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
             for row in array:
                 dist = distance.euclidean(row, median)
                 if dist < min_dist:
-                    min_dist = dist
-                    min = i
+                    min_dist = dist
+                    min_index = i
                 i = i + 1
-            row = match_samples.iloc[min]
-            # use its values as the representative
+            row = match_rows.iloc[min_index]
             for feature in cell['ranges'].keys():
-                cell['representative'][feature] = row[feature].item()
+                cell['representative'][feature] = row[feature]
+            for feature in cell['categories'].keys():
+                cell['representative'][feature] = row[feature]
 
     def _find_sample_nodes(self, samples, nodes):
         paths = self.dt_.decision_path(samples).toarray()
         nodeSet = set(nodes)
         return [(list(set([i for i, v in enumerate(p) if v == 1]) & nodeSet))[0] for p in paths]
 
-    def _generalize(self, data, level_nodes, cells, cells_by_id):
-        representatives = pd.DataFrame(columns=self._features) # empty except for columns
-        generalized = pd.DataFrame(data, columns=self._features, copy=True) # original data
+    def _generalize(self, original_data, prepared_data, level_nodes, cells, cells_by_id):
+        # prepared_data includes the one-hot encoded categorical data + QI
+        representatives = pd.DataFrame(columns=self._features)  # empty except for columns
+        generalized = pd.DataFrame(prepared_data, columns=self.categorical_data.columns, copy=True)
+        original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)
         mapping_to_cells = self._map_to_cells(generalized, level_nodes, cells_by_id)
         # iterate over cells (leaves in decision tree)
         for i in range(len(cells)):
@@ -530,32 +665,39 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                     representatives = representatives.drop(feature, axis=1)
 
             # get the indexes of all records that map to this cell
-            indexes = [j for j in range(len(mapping_to_cells)) if mapping_to_cells[j]['id'] == cells[i]['id']]
+            indexes = [j for j in mapping_to_cells if mapping_to_cells[j]['id'] == cells[i]['id']]
+
             # replaces the values in the representative columns with the representative values
             # (leaves others untouched)
             if indexes and not representatives.columns.empty:
                 if len(indexes) > 1:
-                    replace = pd.concat([representatives.loc[i].to_frame().T]*len(indexes)).reset_index(drop=True)
+                    replace = pd.concat([representatives.loc[i].to_frame().T] * len(indexes)).reset_index(drop=True)
                 else:
                     replace = representatives.loc[i].to_frame().T.reset_index(drop=True)
                 replace.index = indexes
-                generalized.loc[indexes, representatives.columns] = replace
+                replace = pd.DataFrame(replace, index=indexes, columns=self._features)
+                original_data_generalized.loc[indexes, representatives.columns.tolist()] = replace
 
-        return generalized.to_numpy()
+        return original_data_generalized
 
     def _map_to_cells(self, samples, nodes, cells_by_id):
-        mapping_to_cells = []
+        mapping_to_cells = {}
         for index, row in samples.iterrows():
             cell = self._find_sample_cells([row], nodes, cells_by_id)[0]
-            mapping_to_cells.append(cell)
+            mapping_to_cells[index] = cell
         return mapping_to_cells
 
     def _find_sample_cells(self, samples, nodes, cells_by_id):
         node_ids = self._find_sample_nodes(samples, nodes)
         return [cells_by_id[nodeId] for nodeId in node_ids]
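 
+    # NCP (normalized certainty penalty) measures the information loss of a
+    # generalization: for numeric features it is the average width of the
+    # generalized ranges relative to the feature's full range (see
+    # _calc_ncp_numeric); features with low NCP and high accuracy gain are
+    # removed from the generalization first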
-    def _remove_feature_from_generalization(self, samples, nodes, labels, feature_data, current_accuracy):
-        feature = self._get_feature_to_remove(samples, nodes, labels, feature_data, current_accuracy)
+    def _remove_feature_from_generalization(self, original_data, prepared_data, nodes, labels, feature_data,
+                                            current_accuracy):
+        # prepared_data includes the one-hot encoded categorical data;
+        # if there is no categorical data, prepared_data is the original data
+        feature = self._get_feature_to_remove(original_data, prepared_data, nodes, labels, feature_data,
+                                              current_accuracy)
         if feature is None:
             return None
         GeneralizeToRepresentative._remove_feature_from_cells(self.cells_, self.cells_by_id_, feature)
@@ -563,14 +705,18 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
         # self.generalizations_['untouched'].append(feature)
         return feature
 
-    def _get_feature_to_remove(self, samples, nodes, labels, feature_data, current_accuracy):
+    def _get_feature_to_remove(self, original_data, prepared_data, nodes, labels, feature_data, current_accuracy):
+        # prepared_data includes the one-hot encoded categorical data;
+        # if there is no categorical data, prepared_data is the original data
         # We want to remove features with low iLoss (NCP) and high accuracy gain
         # (after removing them)
         ranges = self.generalizations_['ranges']
-        range_counts = self._find_range_count(samples, ranges)
-        total = samples.size
+        range_counts = self._find_range_count(original_data, ranges)
+        total = prepared_data.size
         range_min = sys.float_info.max
         remove_feature = None
+        categories = self.generalizations_['categories']
+        category_counts = self._find_categories_count(original_data, categories)
 
         for feature in ranges.keys():
             if feature not in self.generalizations_['untouched']:
@@ -583,8 +729,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                 new_cells = copy.deepcopy(self.cells_)
                 cells_by_id = copy.deepcopy(self.cells_by_id_)
                 GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
-                generalized = self._generalize(samples, nodes, new_cells, cells_by_id)
-                accuracy_gain = self.estimator.score(generalized, labels) - current_accuracy
+                generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
+                accuracy_gain = self.estimator.score(self.preprocessor.transform(generalized),
+                                                     labels) - current_accuracy
                 if accuracy_gain < 0:
                     accuracy_gain = 0
                 if accuracy_gain != 0:
@@ -594,15 +741,39 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                     range_min = feature_ncp
                     remove_feature = feature
 
+        for feature in categories.keys():
+            if feature not in self.generalizations_['untouched']:
+                feature_ncp = self._calc_ncp_categorical(categories[feature],
+                                                         category_counts[feature],
+                                                         feature_data[feature],
+                                                         total)
+                if feature_ncp > 0:
+                    # divide by accuracy loss
+                    new_cells = copy.deepcopy(self.cells_)
+                    cells_by_id = copy.deepcopy(self.cells_by_id_)
+                    GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
+                    generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
+                    accuracy_gain = self.estimator.score(self.preprocessor.transform(generalized),
+                                                         labels) - current_accuracy
+
+                    if accuracy_gain < 0:
+                        accuracy_gain = 0
+                    if accuracy_gain != 0:
+                        feature_ncp = feature_ncp / accuracy_gain
+                    if feature_ncp < range_min:
+                        range_min = feature_ncp
+                        remove_feature = feature
+
         print('feature to remove: ' + (str(remove_feature) if remove_feature is not None else 'none'))
         return remove_feature
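 
+    # generalizations_ maps each feature to a list of range thresholds under
+    # 'ranges', to partitions of its category values under 'categories', or
+    # lists it under 'untouched' when it is not generalized at all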
     def _calculate_generalizations(self):
         self.generalizations_ = {'ranges': GeneralizeToRepresentative._calculate_ranges(self.cells_),
+                                 'categories': GeneralizeToRepresentative._calculate_categories(self.cells_),
                                  'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells_)}
 
     def _find_range_count(self, samples, ranges):
-        samples_df = pd.DataFrame(samples, columns=self._features)
+        samples_df = pd.DataFrame(samples, columns=self.categorical_data.columns)
         range_counts = {}
         last_value = None
         for r in ranges.keys():
@@ -612,22 +783,42 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                 range_counts[r].append(samples_df.shape[0])
             else:
                 for value in ranges[r]:
-                    range_counts[r].append(len(samples_df.loc[samples_df[r] <= value]))
+                    counter = [item for item in samples_df[r] if float(item) <= value]
+                    range_counts[r].append(len(counter))
                     last_value = value
-                range_counts[r].append(len(samples_df.loc[samples_df[r] > last_value]))
+                counter = [item for item in samples_df[r] if float(item) > last_value]
+                range_counts[r].append(len(counter))
 
         return range_counts
 
+    def _find_categories_count(self, samples, categories):
+        category_counts = {}
+        for c in categories.keys():
+            category_counts[c] = []
+            for value in categories[c]:
+                category_counts[c].append(len(samples.loc[samples[c].isin(value)]))
+        return category_counts
+
     def _calculate_ncp(self, samples, generalizations, feature_data):
         # supressed features are already taken care of within _calc_ncp_numeric
         ranges = generalizations['ranges']
+        categories = generalizations['categories']
         range_counts = self._find_range_count(samples, ranges)
+        category_counts = self._find_categories_count(samples, categories)
+
         total = samples.shape[0]
         total_ncp = 0
         total_features = len(generalizations['untouched'])
         for feature in ranges.keys():
-            feature_ncp = GeneralizeToRepresentative._calc_ncp_numeric(ranges[feature], range_counts[feature], feature_data[feature], total)
+            feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature],
+                                                 feature_data[feature], total)
             total_ncp = total_ncp + feature_ncp
             total_features += 1
+        for feature in categories.keys():
+            feature_ncp = self._calc_ncp_categorical(categories[feature], category_counts[feature],
+                                                     feature_data[feature],
+                                                     total)
+            total_ncp = total_ncp + feature_ncp
+            total_features += 1
         if total_features == 0:
             return 0
         return total_ncp / total_features
@@ -649,6 +840,55 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
             ranges[feature].sort()
         return ranges
 
+    @staticmethod
+    def _calculate_categories(cells):
+        categories = {}
+        categorical_features_values = GeneralizeToRepresentative._calculate_categorical_features_values(cells)
+        for feature in categorical_features_values.keys():
+            partitions = []
+            values = categorical_features_values[feature]
+            assigned = []
+            for i in range(len(values)):
+                value1 = values[i]
+                if value1 in assigned:
+                    continue
+                partition = [value1]
+                assigned.append(value1)
+                for j in range(len(values)):
+                    if j <= i:
+                        continue
+                    value2 = values[j]
+                    if GeneralizeToRepresentative._are_inseparable(cells, feature, value1, value2):
+                        partition.append(value2)
+                        assigned.append(value2)
+                partitions.append(partition)
+            categories[feature] = partitions
+        return categories
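 
+    # two category values are "inseparable" when every cell contains either
+    # both of them or neither, so merging them into one partition loses nothing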
+    @staticmethod
+    def _calculate_categorical_features_values(cells):
+        categorical_features_values = {}
+        for cell in cells:
+            for feature in [key for key in cell['categories'].keys() if
+                            'untouched' not in cell or key not in cell['untouched']]:
+                if feature not in categorical_features_values.keys():
+                    categorical_features_values[feature] = []
+                for value in cell['categories'][feature]:
+                    if value not in categorical_features_values[feature]:
+                        categorical_features_values[feature].append(value)
+        return categorical_features_values
+
+    @staticmethod
+    def _are_inseparable(cells, feature, value1, value2):
+        for cell in cells:
+            if feature not in cell['categories'].keys():
+                continue
+            value1_in = value1 in cell['categories'][feature]
+            value2_in = value2 in cell['categories'][feature]
+            if value1_in != value2_in:
+                return False
+        return True
+
     @staticmethod
     def _calculate_untouched(cells):
         untouched_lists = [cell['untouched'] if 'untouched' in cell else [] for cell in cells]
@@ -656,6 +896,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
         untouched = untouched.intersection(*untouched_lists)
         return list(untouched)
 
+    @staticmethod
+    def _calc_ncp_categorical(categories, category_count, feature_data, total):
+        category_sizes = [len(g) if len(g) > 1 else 0 for g in categories]
+        normalized_category_sizes = [s * n / total for s, n in zip(category_sizes, category_count)]
+        average_group_size = sum(normalized_category_sizes) / len(normalized_category_sizes)
+        return average_group_size / feature_data['range']  # number of values in category
+
     @staticmethod
     def _calc_ncp_numeric(feature_range, range_count, feature_data, total):
         # if there are no ranges, feature is supressed and iLoss is 1
@@ -669,7 +916,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
         average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes)
         return average_range_size / (feature_data['max'] - feature_data['min'])
-
     @staticmethod
     def _remove_feature_from_cells(cells, cells_by_id, feature):
diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py
index 9cb59e1..8dd2244 100644
--- a/tests/test_minimizer.py
+++ b/tests/test_minimizer.py
@@ -1,40 +1,53 @@
 import pytest
 import numpy as np
+import pandas as pd
+from sklearn.compose import ColumnTransformer
 from sklearn.datasets import load_boston
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
 from apt.minimization import GeneralizeToRepresentative
 from sklearn.tree import DecisionTreeClassifier
+from apt.utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset
 
 
 @pytest.fixture
 def data():
     return load_boston(return_X_y=True)
 
+
 def test_minimizer_params(data):
     # Assume two features, age and height, and boolean label
     cells = [{"id": 1, "ranges": {"age": {"start": None, "end": 38}, "height": {"start": None, "end": 170}}, "label": 0,
-              "representative": {"age": 26, "height": 149}},
+              'categories': {}, "representative": {"age": 26, "height": 149}},
             {"id": 2, "ranges": {"age": {"start": 39, "end": None}, "height": {"start": None, "end": 170}}, "label": 1,
-             "representative": {"age": 58, "height": 163}},
+             'categories': {}, "representative": {"age": 58, "height": 163}},
            {"id": 3, "ranges": {"age": {"start": None, "end": 38}, "height": {"start": 171, "end": None}}, "label": 0,
-            "representative": {"age": 31, "height": 184}},
+            'categories': {}, "representative": {"age": 31, "height": 184}},
           {"id": 4, "ranges": {"age": {"start": 39, "end": None}, "height": {"start": 171, "end": None}}, "label": 1,
-           "representative": {"age": 45, "height": 176}}
+           'categories': {}, "representative": {"age": 45, "height": 176}}
          ]
 
     features = ['age', 'height']
 
     X = np.array([[23, 165],
-                 [45, 158],
-                 [18, 190]])
-    print(X.dtype)
-    y = [1,1,0]
-    base_est = DecisionTreeClassifier()
+                  [45, 158],
+                  [18, 190]])
+    y = [1, 1, 0]
+    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
+                                      min_samples_leaf=1)
     base_est.fit(X, y)
 
     gen = GeneralizeToRepresentative(base_est, features=features, cells=cells)
     gen.fit()
     transformed = gen.transform(X)
-    print(transformed)
+    expected_transformed = np.array([[26, 149],
+                                     [58, 163],
+                                     [31, 184]])
+    assert (np.array_equal(expected_transformed, transformed))
+
 
 def test_minimizer_fit(data):
     features = ['age', 'height']
@@ -51,14 +64,318 @@
                   [18, 190]])
     print(X)
     y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
-    base_est = DecisionTreeClassifier()
+    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
+                                      min_samples_leaf=1)
     base_est.fit(X, y)
     predictions = base_est.predict(X)
     gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5)
     gen.fit(X, predictions)
     transformed = gen.transform(X)
-    print(X)
-    print(transformed)
+    gener = gen.generalizations_
+    expected_generalizations = {'ranges': {}, 'categories': {}, 'untouched': ['age', 'height']}
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
+                set([frozenset(sl) for sl in gener['categories'][key]]))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
+    modified_features = [f for f in features if
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
+                             'ranges'].keys()]
+    indexes = []
+    for i in range(len(features)):
+        if features[i] in modified_features:
+            indexes.append(i)
+    assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
+    ncp = gen.ncp_
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
+        assert (ncp > 0)
+        assert (((transformed[indexes]) != (X[indexes])).any())
+
+
+def test_minimizer_fit_pandas(data):
+    features = ['age', 'height', 'sex', 'ola']
+    X = [[23, 165, 'f', 'aa'],
+         [45, 158, 'f', 'aa'],
+         [56, 123, 'f', 'bb'],
+         [67, 154, 'm', 'aa'],
+         [45, 149, 'f', 'bb'],
+         [42, 166, 'm', 'bb'],
+         [73, 172, 'm', 'bb'],
+         [94, 168, 'f', 'aa'],
+         [69, 175, 'm', 'aa'],
+         [24, 181, 'm', 'bb'],
+         [18, 190, 'm', 'bb']]
+    y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
+    X = pd.DataFrame(X, columns=features)
+
+    numeric_features = ["age", "height"]
+    numeric_transformer = Pipeline(
+        steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
+    )
+
+    categorical_features = ["sex", "ola"]
+    categorical_transformer = OneHotEncoder(handle_unknown="ignore")
+
+    preprocessor = ColumnTransformer(
+        transformers=[
+            ("num", numeric_transformer, numeric_features),
+            ("cat", categorical_transformer, categorical_features),
+        ]
+    )
+    encoded = preprocessor.fit_transform(X)
+    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
+                                      min_samples_leaf=1)
+    base_est.fit(encoded, y)
+    predictions = base_est.predict(encoded)
+    # the minimizer receives the raw DataFrame together with the categorical feature names
+    gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5,
+                                     categorical_features=categorical_features)
+    gen.fit(X, predictions)
+    transformed = gen.transform(X)
+    gener = gen.generalizations_
+    expected_generalizations = {'ranges': {'age': []}, 'categories': {}, 'untouched': ['sex', 'height', 'ola']}
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
+                set([frozenset(sl) for sl in gener['categories'][key]]))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
+    modified_features = [f for f in features if
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
+                             'ranges'].keys()]
+    assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
+    ncp = gen.ncp_
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
+        assert (ncp > 0)
+        assert (not (transformed[modified_features]).equals(X[modified_features]))
+
+
+def test_minimizer_params_categorical(data):
+    # Assume three features: age, height and sex, and boolean label
+    cells = [{'id': 1, 'label': 0, 'ranges': {'age': {'start': None, 'end': None}},
+              'categories': {'sex': ['f', 'm']}, 'hist': [2, 0],
+              'representative': {'age': 45, 'height': 149, 'sex': 'f'},
+              'untouched': ['height']},
+             {'id': 3, 'label': 1, 'ranges': {'age': {'start': None, 'end': None}},
+              'categories': {'sex': ['f', 'm']}, 'hist': [0, 3],
+              'representative': {'age': 23, 'height': 165, 'sex': 'f'},
+              'untouched': ['height']},
+             {'id': 4, 'label': 0, 'ranges': {'age': {'start': None, 'end': None}},
+              'categories': {'sex': ['f', 'm']}, 'hist': [1, 0],
+              'representative': {'age': 18, 'height': 190, 'sex': 'm'},
+              'untouched': ['height']}
+             ]
+
+    features = ['age', 'height', 'sex']
+    X = [[23, 165, 'f'],
+         [45, 158, 'f'],
+         [56, 123, 'f'],
+         [67, 154, 'm'],
+         [45, 149, 'f'],
+         [42, 166, 'm'],
+         [73, 172, 'm'],
+         [94, 168, 'f'],
+         [69, 175, 'm'],
+         [24, 181, 'm'],
+         [18, 190, 'm']]
+
+    y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
+    X = pd.DataFrame(X, columns=features)
+    numeric_features = ["age", "height"]
+    numeric_transformer = Pipeline(
+        steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
+    )
+
+    categorical_features = ["sex"]
+    categorical_transformer = OneHotEncoder(handle_unknown="ignore")
+
+    preprocessor = ColumnTransformer(
+        transformers=[
+            ("num", numeric_transformer, numeric_features),
+            ("cat", categorical_transformer, categorical_features),
+        ]
+    )
+    encoded = preprocessor.fit_transform(X)
+    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
+                                      min_samples_leaf=1)
+    base_est.fit(encoded, y)
+    predictions = base_est.predict(encoded)
+    # the minimizer receives the raw DataFrame together with the categorical feature names
+    gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5,
+                                     categorical_features=categorical_features)
+    gen.fit(X, predictions)
+    transformed = gen.transform(X)
+    gener = gen.generalizations_
+    expected_generalizations = {'ranges': {'age': []}, 'categories': {}, 'untouched': ['height', 'sex']}
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
+                set([frozenset(sl) for sl in gener['categories'][key]]))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
+    modified_features = [f for f in features if
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
+                             'ranges'].keys()]
+    assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
+    ncp = gen.ncp_
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
+        assert (ncp > 0)
+        assert (not (transformed[modified_features]).equals(X[modified_features]))
+
+
+def test_minimize_ndarray_iris():
+    features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
+    (x_train, y_train), _ = get_iris_dataset()
+    model = DecisionTreeClassifier(random_state=0, min_samples_split=2,
+                                   min_samples_leaf=1)
+    model.fit(x_train, y_train)
+    pred = model.predict(x_train)
+
+    gen = GeneralizeToRepresentative(model, target_accuracy=0.7, features=features)
+    gen.fit(x_train, pred)
+    transformed = gen.transform(x_train)
+    gener = gen.generalizations_
+    expected_generalizations = {
+        'ranges': {'sepal length (cm)': [5.0], 'sepal width (cm)': [], 'petal length (cm)': [4.950000047683716],
+                   'petal width (cm)': [0.800000011920929, 1.699999988079071]}, 'categories': {}, 'untouched': []}
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
+                set([frozenset(sl) for sl in gener['categories'][key]]))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
+    modified_features = [f for f in features if
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
+                             'ranges'].keys()]
+    indexes = []
+    for i in range(len(features)):
+        if features[i] in modified_features:
+            indexes.append(i)
+    assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
+    ncp = gen.ncp_
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
+        assert (ncp > 0)
+        assert (((transformed[indexes]) != (x_train[indexes])).any())
+
+
+def test_minimize_pandas_nursery():
+    (x_train, y_train), _ = get_nursery_dataset()
+    x_train = x_train.astype(str)
+    x_train.reset_index(inplace=True, drop=True)
+    y_train.reset_index(inplace=True, drop=True)
+    QI = ["finance", "social", "health"]
+    features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health"]
+    categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health", "children"]
+    numeric_features = [f for f in features if f not in categorical_features]
+    numeric_transformer = Pipeline(
+        steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
+    )
+    categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
+    preprocessor = ColumnTransformer(
+        transformers=[
+            ("num", numeric_transformer, numeric_features),
+            ("cat", categorical_transformer, categorical_features),
+        ]
+    )
+    encoded = preprocessor.fit_transform(x_train)
+    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
+                                      min_samples_leaf=1)
+    base_est.fit(encoded, y_train)
+    predictions = base_est.predict(encoded)
+
+    gen = GeneralizeToRepresentative(base_est, target_accuracy=0.8, features=features,
+                                     categorical_features=categorical_features)
+    gen.fit(x_train, predictions)
+    transformed = gen.transform(x_train)
+    gener = gen.generalizations_
+    expected_generalizations = {'ranges': {},
+                                'categories': {'parents': [['great_pret', 'pretentious', 'usual']],
+                                               'has_nurs': [['critical', 'less_proper', 'proper'],
+                                                            ['very_crit'], ['improper']],
+                                               'form': [['foster', 'completed', 'complete', 'incomplete']],
+                                               'housing': [['convenient', 'less_conv', 'critical']],
+                                               'finance': [['convenient', 'inconv']],
+                                               'social': [['problematic', 'nonprob', 'slightly_prob']],
+                                               'health': [['priority'], ['recommended'], ['not_recom']],
+                                               'children': [['2', '3', '4', '1']]},
+                                'untouched': []}
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
+                set([frozenset(sl) for sl in gener['categories'][key]]))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
+    modified_features = [f for f in features if
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
+                             'ranges'].keys()]
+    assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
+    ncp = gen.ncp_
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
+        assert (ncp > 0)
+        assert (not (transformed[modified_features]).equals(x_train[modified_features]))
+
+
+def test_minimize_pandas_adult():
+    (x_train, y_train), _ = get_adult_dataset()
+    x_train = x_train.head(5000)
+    y_train = y_train.head(5000)
+
+    features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
+                'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
+
+    categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
+                            'native-country']
+
+    numeric_features = [f for f in features if f not in categorical_features]
+    numeric_transformer = Pipeline(
+        steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
+    )
+    categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
+    preprocessor = ColumnTransformer(
+        transformers=[
+            ("num", numeric_transformer, numeric_features),
+            ("cat", categorical_transformer, categorical_features),
+        ]
+    )
+    encoded = preprocessor.fit_transform(x_train)
+    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
+                                      min_samples_leaf=1)
+    base_est.fit(encoded, y_train)
+    predictions = base_est.predict(encoded)
+
+    gen = GeneralizeToRepresentative(base_est, target_accuracy=0.8, features=features,
+                                     categorical_features=categorical_features)
+    gen.fit(x_train, predictions)
+    transformed = gen.transform(x_train)
+    gener = gen.generalizations_
+    expected_generalizations = {
+        'ranges': {'age': [20.0], 'education-num': [11.5, 12.5], 'capital-gain': [5095.5, 7139.5],
+                   'capital-loss': [], 'hours-per-week': []},
+        'categories': {'workclass': [
+            ['Private', 'Without-pay', 'Self-emp-not-inc', '?', 'Federal-gov', 'Self-emp-inc', 'State-gov',
+             'Local-gov']], 'marital-status': [
+            ['Married-civ-spouse', 'Never-married', 'Widowed', 'Married-AF-spouse', 'Separated',
+             'Married-spouse-absent'], ['Divorced']], 'occupation': [
+            ['Transport-moving', 'Priv-house-serv', '?', 'Armed-Forces', 'Prof-specialty', 'Farming-fishing',
+             'Exec-managerial', 'Machine-op-inspct', 'Other-service', 'Sales', 'Protective-serv', 'Handlers-cleaners',
+             'Tech-support', 'Craft-repair', 'Adm-clerical']], 'relationship': [
+            ['Not-in-family', 'Own-child', 'Wife', 'Other-relative', 'Husband', 'Unmarried']], 'race': [
+            ['Other', 'Asian-Pac-Islander', 'Black', 'White', 'Amer-Indian-Eskimo']], 'sex': [['Male', 'Female']],
+            'native-country': [
+                ['LatinAmerica', 'Other', 'UnitedStates', 'SouthAmerica',
+                 'BritishCommonwealth', 'Euro_2', 'Unknown', 'China',
+                 'Euro_1', 'SE_Asia']]},
+        'untouched': []}
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
+                set([frozenset(sl) for sl in gener['categories'][key]]))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
+    modified_features = [f for f in features if
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
+                             'ranges'].keys()]
+    assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
+    ncp = gen.ncp_
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
+        assert (ncp > 0)
+        assert (not (transformed[modified_features]).equals(x_train[modified_features]))
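
For reference, a minimal end-to-end sketch of the new categorical flow, assembled from the data and parameters of `test_minimizer_fit_pandas` / `test_minimizer_params_categorical` above. It assumes the `apt` package layout and the scikit-learn/pandas versions this patch targets; the printed output shapes are illustrative only.

```python
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

from apt.minimization import GeneralizeToRepresentative

features = ['age', 'height', 'sex']
X = pd.DataFrame([[23, 165, 'f'], [45, 158, 'f'], [56, 123, 'f'], [67, 154, 'm'],
                  [45, 149, 'f'], [42, 166, 'm'], [73, 172, 'm'], [94, 168, 'f'],
                  [69, 175, 'm'], [24, 181, 'm'], [18, 190, 'm']], columns=features)
y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]

# the base estimator is trained on one-hot encoded data, exactly as in the tests
preprocessor = ColumnTransformer(transformers=[
    ("num", Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]),
     ["age", "height"]),
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["sex"]),
])
encoded = preprocessor.fit_transform(X)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, min_samples_leaf=1)
base_est.fit(encoded, y)

# the minimizer receives the raw DataFrame plus the categorical feature names
gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5,
                                 categorical_features=["sex"])
gen.fit(X, base_est.predict(encoded))

print(gen.generalizations_)  # {'ranges': ..., 'categories': ..., 'untouched': ...}
print(gen.transform(X))      # records replaced by their cells' representative values
```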