From f646109e849ee9f54dd1502d1ac273efd28b69dd Mon Sep 17 00:00:00 2001
From: abigailt <abigailt@il.ibm.com>
Date: Sun, 24 Dec 2023 13:13:52 -0500
Subject: [PATCH] Replace values in multi-column 1-hot encoded features instead
 of appending so that options are narrowed down

Signed-off-by: abigailt <abigailt@il.ibm.com>
---
 apt/minimization/minimizer.py | 18 ++++++--------
 tests/test_minimizer.py       | 46 +++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 11 deletions(-)

diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py
index 39521d4..8f2d1b8 100644
--- a/apt/minimization/minimizer.py
+++ b/apt/minimization/minimizer.py
@@ -602,7 +602,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
             elif feature in cell['untouched']:
                 continue
             else:
-                raise TypeError("feature " + feature + "not found in cell" + cell['id'])
+                raise TypeError("feature " + str(feature) + " not found in cell " + str(cell['id']))
         # Mark as mapped
         mapped.itemset(index, 1)
         return True
@@ -736,24 +736,20 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                             feature_value = 0
                         elif range['end'] is None and range['start'] > 0:
                             feature_value = 1
-                        elif range['start'] is not None and range['end'] is not None:
-                            print(range)
-                        new_cell['categories'][feature].append(feature_value)
+                        new_cell['categories'][feature] = [feature_value]
 
                         # need to add other columns that represent same 1-hot encoded feature
 
                         # search for feature group:
                         other_features, encoded = self._get_other_features_in_encoding(feature, self.feature_slices)
                         for other_feature in other_features:
-                            if other_feature not in new_cell['categories'].keys():
-                                new_cell['categories'][other_feature] = []
                             if feature_value == 1:
-                                new_cell['categories'][other_feature].append(0)
+                                new_cell['categories'][other_feature] = [0]
                             elif len(encoded) == 2:
-                                new_cell['categories'][other_feature].append(1)
-                            else:
-                                new_cell['categories'][other_feature].append(0)
-                                new_cell['categories'][other_feature].append(1)
+                                new_cell['categories'][other_feature] = [1]
+                            elif (other_feature not in new_cell['categories'].keys() or
+                                  len(new_cell['categories'][other_feature]) == 0):
+                                new_cell['categories'][other_feature] = [0, 1]
                 else:
                     if feature in cell['ranges'].keys():
                         new_cell['ranges'][feature] = cell['ranges'][feature]
diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py
index ca69cf4..99d1a19 100644
--- a/tests/test_minimizer.py
+++ b/tests/test_minimizer.py
@@ -1060,6 +1060,52 @@ def test_minimizer_ndarray_one_hot_multi():
     assert ((np.min(transformed_slice, axis=1) == 0).all())
 
 
+def test_minimizer_ndarray_one_hot_multi2():
+    x_train = np.array([[0, 0, 1],
+                        [0, 0, 1],
+                        [0, 1, 0],
+                        [0, 1, 0],
+                        [1, 0, 0],
+                        [1, 0, 0]])
+    y_train = np.array([1, 1, 2, 2, 0, 0])
+
+    model = DecisionTreeClassifier()
+    model.fit(x_train, y_train)
+    predictions = model.predict(x_train)
+
+    features = ['0', '1', '2']
+    QI = [0, 1, 2]
+    QI_slices = [[0, 1, 2]]
+    target_accuracy = 0.2
+    gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, feature_slices=QI_slices,
+                                     features_to_minimize=QI)
+    gen.fit(dataset=ArrayDataset(x_train, predictions))
+    transformed = gen.transform(dataset=ArrayDataset(x_train))
+    gener = gen.generalizations
+    expected_generalizations = {'categories':
+                                    {'1': [[0, 1]], '2': [[0, 1]], '3': [[0, 1]], '4': [[0, 1]], '5': [[0, 1]]},
+                                'category_representatives': {'1': [0], '2': [1], '3': [0], '4': [1], '5': [0]},
+                                'range_representatives': {'0': []}, 'ranges': {'0': []}, 'untouched': ['6']}
+
+    compare_generalizations(gener, expected_generalizations)
+
+    check_features(features, expected_generalizations, transformed, x_train)
+    ncp = gen.ncp.transform_score
+    check_ncp(ncp, expected_generalizations)
+
+    rel_accuracy = model.score(transformed, predictions)
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
+    transformed_slice = transformed[:, QI_slices[0]]
+    assert ((np.sum(transformed_slice, axis=1) == 1).all())
+    assert ((np.max(transformed_slice, axis=1) == 1).all())
+    assert ((np.min(transformed_slice, axis=1) == 0).all())
+    transformed_slice = transformed[:, QI_slices[1]]
+    assert ((np.sum(transformed_slice, axis=1) == 1).all())
+    assert ((np.max(transformed_slice, axis=1) == 1).all())
+    assert ((np.min(transformed_slice, axis=1) == 0).all())
+
+
+
 def test_anonymize_pandas_one_hot():
     features = ["age", "gender_M", "gender_F", "height"]
     x_train = np.array([[23, 0, 1, 165],