From 2a657388af0655440548e7e8fc09bf698cb82943 Mon Sep 17 00:00:00 2001 From: abigailt Date: Wed, 15 Nov 2023 08:21:40 -0500 Subject: [PATCH] Initial version with first working test Signed-off-by: abigailt --- apt/minimization/minimizer.py | 81 +++++++++++++++++++++++++++++++---- tests/test_minimizer.py | 80 +++++++++++++++++++++++++++++++++- 2 files changed, 150 insertions(+), 11 deletions(-) diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py index 0e2e3b8..53ef7c5 100644 --- a/apt/minimization/minimizer.py +++ b/apt/minimization/minimizer.py @@ -56,9 +56,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM :param encoder: Optional encoder for encoding data before feeding it into the estimator (e.g., for categorical features). If not provided, the data will be fed as is directly to the estimator. :type encoder: sklearn OrdinalEncoder or OneHotEncoder - :type categorical_features: list of strings, optional - :param features_to_minimize: The features to be minimized. + :type categorical_features: list of strings or integers, optional + :param features_to_minimize: The features to be minimized. If not provided, all features will be minimized. :type features_to_minimize: list of strings or int, optional + :param feature_slices: If some of the features to be minimized represent 1-hot encoded features that need to remain + consistent after minimization, provide a list containing the list of column names + or indexes that represent a single feature. + :type feature_slices: list of lists of strings or integers, optional :param train_only_features_to_minimize: Whether to train the tree just on the ``features_to_minimize`` or on all features. Default is only on ``features_to_minimize``. :type train_only_features_to_minimize: boolean, optional @@ -79,6 +83,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM categorical_features: Optional[Union[np.ndarray, list]] = None, encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None, features_to_minimize: Optional[Union[np.ndarray, list]] = None, + feature_slices: Optional[list] = None, train_only_features_to_minimize: Optional[bool] = True, is_regression: Optional[bool] = False, generalize_using_transform: bool = True): @@ -97,6 +102,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM if categorical_features: self.categorical_features = categorical_features self.features_to_minimize = features_to_minimize + self.feature_slices = feature_slices + if self.feature_slices: + self.all_one_hot_features = set([str(feature) for encoded in self.feature_slices for feature in encoded]) + else: + self.all_one_hot_features = set() self.train_only_features_to_minimize = train_only_features_to_minimize self.is_regression = is_regression self.encoder = encoder @@ -121,6 +131,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM ret['target_accuracy'] = self.target_accuracy ret['categorical_features'] = self.categorical_features ret['features_to_minimize'] = self.features_to_minimize + ret['feature_slices'] = self.feature_slices ret['train_only_features_to_minimize'] = self.train_only_features_to_minimize ret['is_regression'] = self.is_regression ret['estimator'] = self.estimator @@ -151,6 +162,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self.categorical_features = params['categorical_features'] if 'features_to_minimize' in params: self.features_to_minimize = params['features_to_minimize'] + if 'feature_slices' in params: + self.feature_slices = params['feature_slices'] if 'train_only_features_to_minimize' in params: self.train_only_features_to_minimize = params['train_only_features_to_minimize'] if 'is_regression' in params: @@ -259,6 +272,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self.features_to_minimize = [str(i) for i in self.features_to_minimize] if not all(elem in self._features for elem in self.features_to_minimize): raise ValueError('features to minimize should be a subset of features names') + if self.feature_slices: + temp_list = [] + for slice in self.feature_slices: + new_slice = [str(i) for i in slice] + if not all(elem in self._features for elem in new_slice): + raise ValueError('features in slices should be a subset of features names') + temp_list.append(new_slice) + self.feature_slices = temp_list x_qi = x.loc[:, self.features_to_minimize] # divide dataset into train and test @@ -703,6 +724,36 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM # categorical feature can not have this value if categorical_value in new_cell['categories'][categorical_feature]: new_cell['categories'][categorical_feature].remove(categorical_value) + # features that were already one-hot encoded. Legal values should be 0 or 1 + elif feature in self.all_one_hot_features: + if feature not in new_cell['categories'].keys(): + new_cell['categories'][feature] = [] + if feature in cell['ranges']: + range = cell['ranges'][feature] + if range['start'] is None and range['end'] < 1: + feature_value = 0 + elif range['end'] is None and range['start'] > 0: + feature_value = 1 + elif range['start'] is not None and range['end'] is not None: + print(range) + new_cell['categories'][feature].append(feature_value) + + # need to add other columns that represent same 1-hot encoded feature + + # search for feature group: + for encoded in self.feature_slices: + if feature in encoded: + other_features = list(set(encoded) - set([feature])) + for other_feature in other_features: + if other_feature not in new_cell['categories'].keys(): + new_cell['categories'][other_feature] = [] + if feature_value == 1: + new_cell['categories'][other_feature].append(0) + elif len(encoded) == 2: + new_cell['categories'][other_feature].append(1) + else: + new_cell['categories'][other_feature].append(0) + new_cell['categories'][other_feature].append(1) else: if feature in cell['ranges'].keys(): new_cell['ranges'][feature] = cell['ranges'][feature] @@ -813,6 +864,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM min_dist = dist min = i i = i + 1 + # since this is an actual row from the data, correct one-hot encoding is already guaranteed row = match_rows.iloc[min] for feature in cell['ranges'].keys(): cell['representative'][feature] = row[feature] @@ -861,6 +913,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM new_dtypes = {} for t in dtypes.keys(): new_dtypes[t] = pd.Series(dtype=dtypes[t].name) + dtypes[t] = dtypes[t].name representatives = pd.DataFrame(new_dtypes) # empty except for columns original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True) @@ -891,6 +944,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM replace = pd.DataFrame(replace, indexes, columns=self._features) original_data_generalized.loc[indexes, representatives.columns.tolist()] = replace + original_data_generalized = original_data_generalized.astype(dtype=dtypes) return original_data_generalized def _generalize(self, data, data_prepared, nodes): @@ -1024,7 +1078,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM current_accuracy): new_cells = copy.deepcopy(self.cells) cells_by_id = copy.deepcopy(self._cells_by_id) - GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature) + self._remove_feature_from_cells(new_cells, cells_by_id, feature) generalized = self._generalize_from_tree(original_data, prepared_data, nodes, new_cells, cells_by_id) accuracy = self._calculate_accuracy(generalized, labels, self.estimator, self.encoder) @@ -1229,16 +1283,25 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM untouched = untouched.intersection(*untouched_lists) return list(untouched) + def _remove_feature_from_cells(self, cells, cells_by_id, feature): + if feature in self.all_one_hot_features: + for encoded in self.feature_slices: + if feature in encoded: + self._remove_feature_from_cells_internal(cells, cells_by_id, encoded) + else: + self._remove_feature_from_cells_internal(cells, cells_by_id, [feature]) + @staticmethod - def _remove_feature_from_cells(cells, cells_by_id, feature): + def _remove_feature_from_cells_internal(cells, cells_by_id, features): for cell in cells: if 'untouched' not in cell: cell['untouched'] = [] - if feature in cell['ranges'].keys(): - del cell['ranges'][feature] - elif feature in cell['categories'].keys(): - del cell['categories'][feature] - cell['untouched'].append(feature) + for feature in features: + if feature in cell['ranges'].keys(): + del cell['ranges'][feature] + elif feature in cell['categories'].keys(): + del cell['categories'][feature] + cell['untouched'].append(feature) cells_by_id[cell['id']] = cell.copy() @staticmethod diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py index ca34fbd..2cc141e 100644 --- a/tests/test_minimizer.py +++ b/tests/test_minimizer.py @@ -200,9 +200,9 @@ def check_features(features, expected_generalizations, transformed, x, pandas=Fa if features[i] in modified_features: indexes.append(i) if len(indexes) != transformed.shape[1]: - assert ((np.delete(transformed, indexes, axis=1) == np.delete(x, indexes, axis=1)).all()) + assert (np.array_equal(np.delete(transformed, indexes, axis=1), np.delete(x, indexes, axis=1))) if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0: - assert (((transformed[indexes]) != (x[indexes])).any()) + assert (not np.array_equal(transformed[:, indexes], x[:, indexes])) def check_ncp(ncp, expected_generalizations): @@ -920,6 +920,82 @@ def test_BaseEstimator_regression(diabetes_dataset): assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) +def test_minimizer_ndarray_one_hot(): + x_train = np.array([[23, 0, 1, 165], + [45, 0, 1, 158], + [56, 1, 0, 123], + [67, 0, 1, 154], + [45, 1, 0, 149], + [42, 1, 0, 166], + [73, 0, 1, 172], + [94, 0, 1, 168], + [69, 0, 1, 175], + [24, 1, 0, 181], + [18, 1, 0, 190]]) + y_train = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]) + + model = DecisionTreeClassifier() + model.fit(x_train, y_train) + predictions = model.predict(x_train) + + features = ['0', '1', '2', '3'] + QI = [0, 1, 2] + QI_slices = [[1, 2]] + target_accuracy = 0.7 + gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, feature_slices=QI_slices, + features_to_minimize=QI) + gen.fit(dataset=ArrayDataset(x_train, predictions)) + transformed = gen.transform(dataset=ArrayDataset(x_train)) + gener = gen.generalizations + expected_generalizations = {'categories': {}, 'category_representatives': {}, + 'range_representatives': {'0': [34.5]}, + 'ranges': {'0': [34.5]}, 'untouched': ['3', '1', '2']} + + compare_generalizations(gener, expected_generalizations) + + check_features(features, expected_generalizations, transformed, x_train) + ncp = gen.ncp.transform_score + check_ncp(ncp, expected_generalizations) + + rel_accuracy = model.score(transformed, predictions) + assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF) + + +def test_anonymize_pandas_one_hot(): + feature_names = ["age", "gender_M", "gender_F", "height"] + x_train = np.array([[23, 0, 1, 165], + [45, 0, 1, 158], + [56, 1, 0, 123], + [67, 0, 1, 154], + [45, 1, 0, 149], + [42, 1, 0, 166], + [73, 0, 1, 172], + [94, 0, 1, 168], + [69, 0, 1, 175], + [24, 1, 0, 181], + [18, 1, 0, 190]]) + y_train = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]) + x_train = pd.DataFrame(x_train, columns=feature_names) + y_train = pd.Series(y_train) + + model = DecisionTreeClassifier() + model.fit(x_train, y_train) + pred = model.predict(x_train) + + k = 10 + QI = ["age", "gender_M", "gender_F"] + QI_slices = [["gender_M", "gender_F"]] + anonymizer = Anonymize(k, QI, train_only_QI=True, quasi_identifer_slices=QI_slices) + anon = anonymizer.anonymize(ArrayDataset(x_train, pred)) + assert (anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0]) + assert (anon.loc[:, QI].value_counts().min() >= k) + np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1)) + anonymized_slice = anon.loc[:, QI_slices[0]] + assert ((np.sum(anonymized_slice, axis=1) == 1).all()) + assert ((np.max(anonymized_slice, axis=1) == 1).all()) + assert ((np.min(anonymized_slice, axis=1) == 0).all()) + + def test_keras_model(): (x, y), (x_test, y_test) = get_iris_dataset_np()