From c2e0fced031e8fa06ae154ddc38dd30ab61877cb Mon Sep 17 00:00:00 2001 From: abigailt Date: Mon, 7 Aug 2023 15:59:22 +0300 Subject: [PATCH] Compute generalizations with test data when possible (for computing better representatives). Signed-off-by: abigailt --- apt/minimization/minimizer.py | 57 ++++++++++++++++++++--------------- tests/test_minimizer.py | 17 +++++++++++ 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py index 9d706ac..9681e29 100644 --- a/apt/minimization/minimizer.py +++ b/apt/minimization/minimizer.py @@ -325,7 +325,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes) # self._cells currently holds the generalization created from the tree leaves - self._calculate_generalizations() + self._calculate_generalizations(X_test) if generalize_using_transform: generalized = self._generalize_from_tree(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id) else: @@ -355,7 +355,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes) - self._calculate_generalizations() + self._calculate_generalizations(X_test) if generalize_using_transform: generalized = self._generalize_from_tree(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id) @@ -385,7 +385,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM if removed_feature is None: break - self._calculate_generalizations() + self._calculate_generalizations(X_test) if generalize_using_transform: generalized = self._generalize_from_tree(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id) @@ -1084,6 +1084,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self._generalizations['ranges'], self._generalizations['categories']) # categorical - use most common value + old_category_representatives = category_representatives category_representatives = {} for feature in self._generalizations['categories']: category_representatives[feature] = [] @@ -1092,34 +1093,42 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM # for c_index in range(len(group)): # indexes = [i for i, s in enumerate(sample_indexes) if s[feature][g_index] == c_index] indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == g_index] - rows = samples[indexes] - values = rows[:, feature] - category = Counter(values).most_common(1)[0][0] - category_representatives[feature].append(group[category]) - # c_count = len([s for s in sample_indexes if s[feature][g_index] == c_index]) - # if c_count > max_count: - # max_count = c_count - # category = c_index - # category_representatives[feature].append(group[category]) + if indexes: + rows = samples.iloc[indexes] + values = rows[feature] + category = Counter(values).most_common(1)[0][0] + category_representatives[feature].append(category) + # c_count = len([s for s in sample_indexes if s[feature][g_index] == c_index]) + # if c_count > max_count: + # max_count = c_count + # category = c_index + # category_representatives[feature].append(group[category]) + else: + category_representatives[feature].append(old_category_representatives[feature][g_index]) # numerical - use actual value closest to mean + old_range_representatives = range_representatives range_representatives = {} for feature in self._generalizations['ranges']: + range_representatives[feature] = [] # find the mean value (per feature) for index in range(len(self._generalizations['ranges'][feature])): indexes = [i for i, s in enumerate(sample_indexes) if s[feature] == index] - rows = samples[indexes] - values = rows[:, feature] - median = np.median(values) - min_value = max(values) - min_dist = float("inf") - for value in values: - # euclidean distance between two floating point values - dist = abs(value - median) - if dist < min_dist: - min_dist = dist - min_value = value - range_representatives[feature].append(min_value) + if indexes: + rows = samples.iloc[indexes] + values = rows[feature] + median = np.median(values) + min_value = max(values) + min_dist = float("inf") + for value in values: + # euclidean distance between two floating point values + dist = abs(value - median) + if dist < min_dist: + min_dist = dist + min_value = value + range_representatives[feature].append(min_value) + else: + range_representatives[feature].append(old_range_representatives[feature][index]) self._generalizations['category_representatives'] = category_representatives self._generalizations['range_representatives'] = range_representatives diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py index 0ac6111..47ab69b 100644 --- a/tests/test_minimizer.py +++ b/tests/test_minimizer.py @@ -49,7 +49,24 @@ def test_minimizer_params(): model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES) model.fit(ArrayDataset(X, y)) + expected_generalizations = {'categories': {}, 'category_representatives': {}, + 'range_representatives': {'age': [38, 0.5, 40], 'height': [170, 0.5, 172]}, + 'ranges': {'age': [38, 39], 'height': [170, 171]}, 'untouched': []} + gen = GeneralizeToRepresentative(model, cells=cells) + gener = gen.generalizations + for key in expected_generalizations['ranges']: + assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key])) + for key in expected_generalizations['categories']: + assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) + == set([frozenset(sl) for sl in gener['categories'][key]])) + assert (set(expected_generalizations['untouched']) == set(gener['untouched'])) + for key in expected_generalizations['range_representatives']: + assert (set(expected_generalizations['range_representatives'][key]) == set(gener['range_representatives'][key])) + for key in expected_generalizations['category_representatives']: + assert (set([frozenset(sl) for sl in expected_generalizations['category_representatives'][key]]) + == set([frozenset(sl) for sl in gener['category_representatives'][key]])) + gen.fit() gen.transform(dataset=ArrayDataset(X, features_names=features))